set.seed(123)
Homoscedastic Error Variances
For the visualization of homocedastic and hetercedastic variance, we
simulate:
\[
y \sim N \left ( -1 + 2x, 1 \right )
\]
The funnel-shaped heterocedasticy is simulated using:
\[
y \sim N \left ( -1 + 2x, \left ( 0.1 + 0.3 \left ( x + 3 \right )
\right )^{2} \right )
\]
par(mfrow = c(2, 2))
# Homecedastic variance
X <- seq(-2, 2, length.out = 300)
homecedastic_random <- function (x){
rnorm(1, mean = -1 + 2 * x, sd = 1)[1]
}
homo_y <- sapply(X, homecedastic_random)
homo_e <- (-1 + 2 * X) - homo_y
plot(X, homo_y, xlab='', ylab='', main='homocedastic variance')
abline(-1, 2, col = "red", lwd=2)
plot(X, homo_e, xlab='', ylab='', main='homocedastic variance errors')
abline(0, 0, col = "red", lwd=2)
heterocedastic_random <- function (x){
rnorm(1, mean = -1 + 2 * x, sd = 0.1 + 0.3 * (x + 3))[1]
}
hetero_y <- sapply(X, heterocedastic_random)
hetero_e <- (-1 + 2 * X) - hetero_y
plot(X, hetero_y, xlab='', ylab='', main='funnel-shaped heterocedastic variance')
abline(-1, 2, col = "red", lwd=2)
plot(X, hetero_e, xlab='', ylab='', main='funnel-shaped heterocedastic variance errors')
abline(0, 0, col = "red", lwd=2)

Example 3.1 Munich Rent Index—Heteroscedastic Variances
par(mfrow = c(1, 2))
munich_rent_url <- "https://www.uni-goettingen.de/de/document/download/64c29c1b1fccb142cfa8f29a942a9e05.raw/rent99.raw"
munich_rent_index <- read.table(
url(munich_rent_url),
header = 1,
colClasses = c(
"numeric", "numeric", "numeric",
"numeric", "factor", "factor",
"factor", "factor", "factor"
)
)
simple_reg <- lm(rent ~ area, data = munich_rent_index)
plot(munich_rent_index$area, munich_rent_index$rent, xlab = 'area in sqm', ylab = 'net rent in Euro')
abline(simple_reg, col = "red", lwd = 2)
predicted <- predict(simple_reg)
plot(munich_rent_index$area, predicted - munich_rent_index$rent, xlab = 'area in sqm', ylab = 'residuals')
abline(0, 0, col = "red", lwd = 2)

Uncorrelated Errors
Simulated autocorrelated errors with positive correlation are
generated using:
\[
y_{i} = -1 + 2 x_{i} + \epsilon_{i}
\] \[
\text{Where } \epsilon_{i} = 0.9 \epsilon_{i - 1} + \mu_{i} \wedge
\mu_{i} \sim N(0, 0.5^{2})
\]
par(mfrow = c(1, 2))
n <- 100
X <- seq(-3, 3, length.out = n)
mu <- rnorm(n, mean = 0, sd = 0.5)
epsilon <- mu[1]
for(i in seq(2, n)){
epsilon <- c(epsilon, 0.9 * epsilon[i - 1] + mu[i])
}
y_pred <- -1 + 2 * X + epsilon
plot(X, y_pred, xlab = 'x', ylab = '', main = 'observations and regression line')
abline(-1, 2, col = "red", lwd = 2)
plot(epsilon, xlab = 'i', ylab = '', main = 'time series of the errors')
abline(0, 0, col = "red", lwd = 2)

Simulated autocorrelated errors with negative correlation are
generated using:
\[
y_{i} = -1 + 2 x_{i} + \epsilon_{i}
\] \[
\text{Where } \epsilon_{i} = - 0.9 \epsilon_{i - 1} + \mu_{i} \wedge
\mu_{i} \sim N(0, 0.5^{2})
\]
par(mfrow = c(1, 2))
n <- 100
X <- seq(-3, 3, length.out = n)
mu <- rnorm(n, mean = 0, sd = 0.5)
epsilon <- mu[1]
for(i in seq(2, n)){
epsilon <- c(epsilon, -0.9 * epsilon[i - 1] + mu[i])
}
y_pred <- -1 + 2 * X + epsilon
plot(X, y_pred, xlab = 'x', ylab = '', main = 'observations and regression line')
abline(-1, 2, col = "red", lwd = 2)
plot(epsilon, xlab = 'i', ylab = '', main = 'time series of the errors')
abline(0, 0, col = "red", lwd = 2)

Mispecified model is simulated using:
\[
y_{i} = \sin(x_{i}) + x_{i} + \epsilon_{i}
\] \[
\text{Where } \epsilon_{i} \sim N(0, 0.3^{2})
\]
par(mfrow = c(2, 2))
n <- 100
X <- seq(-3, 3, length.out = n)
epsilon <- rnorm(n, mean = 0, sd = 0.3)
y <- sin(X) + X + epsilon
plot(X, y, xlab = '', ylab = '', main = 'observations and true function')
lines(X, sin(X) + X, col = "red", lwd = 2)
linear_model <- lm(y ~ X)
plot(X, y, xlab = '', ylab = '', main = 'observations and regression line')
abline(linear_model, col = "red", lwd = 2)
y_pred <- predict(linear_model, data = X)
res <- y_pred - y
plot(X, res, xlab = '', ylab = '', main = 'residuals')
abline(0, 0, col = "red", lwd = 2)

Additivity of Errors
We simulate data using:
\[
y_{i} = \exp(1 + x_{i1} - x_{i2} + \epsilon_{i})
\]
With: \[
\epsilon_{i} \sim N \left ( 0, {0.4}^{2} \right )
\]
We plot this model:
par(mfrow = c(2, 2))
n <- 50
x1 <- seq(0, 3, length.out = n)
x2 <- seq(0, 3, length.out = n)
x_grid <- expand.grid(x1 = x1, x2 = x2)
epsilon <- rnorm(n^2, mean = 0, sd = 0.4)
y <- exp(1 + x_grid$x1 - x_grid$x2 + epsilon)
plot(
x_grid$x1, y,
main = 'scatter plot: y versus x1',
xlab = 'x1',
ylab = 'y'
)
plot(
x_grid$x2, y,
main = 'scatter plot: y versus x2',
xlab = 'x2',
ylab = 'y'
)
plot(
x_grid$x1, log(y),
main = 'scatter plot: log(y) versus x1',
xlab = 'x1',
ylab = 'log(y)'
)
plot(
x_grid$x2, log(y),
main = 'scatter plot: log(y) versus x2',
xlab = 'x2',
ylab = 'log(y)'
)

Example 3.2 Supermarket Scanner Data
Data not available online.
Example 3.3 Munich Rent Index — Variable Transformation
Importing data using script:
source("import_data/munich_rent_index.R")
We do the regression model:
\[
\text{rentsqm}_{i} = \beta_{0} + \beta_{1} \cdot f(\text{area}_{i}) +
\epsilon_{i}
\]
With both \(f(\cdot)\):
\[
f(\text{area}_{i}) = \text{area}_{i}
\]
For linear model and:
\[
f(\text{area}_{i}) = \frac{1}{\text{area}_{i}}
\]
For future use we define those models:
Model 1 (\(M1\)):
\[
\mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \text{area}
\]
Model 2 (\(M2\)):
\[
\mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \frac{1}{\text{area}}
\]
For nonlinear relationship between \(\text{rentsqm}\) and \(\text{area}\). The plotted graph below are
those two models with the left panels the linear regression without the
transformation and the right panels with the inverse transformation.
par(mfrow = c(3, 2))
m1 <- lm(rentsqm ~ area, data = munich_rent_index)
summary(m1)
Call:
lm(formula = rentsqm ~ area, data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.9622 -1.5737 -0.1102 1.5861 9.9674
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.46883 0.12426 76.20 <2e-16 ***
area -0.03499 0.00174 -20.11 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.291 on 3080 degrees of freedom
Multiple R-squared: 0.1161, Adjusted R-squared: 0.1158
F-statistic: 404.5 on 1 and 3080 DF, p-value: < 2.2e-16
m2 <- lm(rentsqm ~ I(1 / area), data = munich_rent_index)
summary(m2)
Call:
lm(formula = rentsqm ~ I(1/area), data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-7.3963 -1.5733 -0.0921 1.5824 10.1287
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.7321 0.1084 43.65 <2e-16 ***
I(1/area) 140.1783 5.9287 23.64 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.241 on 3080 degrees of freedom
Multiple R-squared: 0.1536, Adjusted R-squared: 0.1533
F-statistic: 559 on 1 and 3080 DF, p-value: < 2.2e-16
plot_rentsqm_area <- function(title) {
plot(
munich_rent_index$area,
munich_rent_index$rentsqm,
main = title,
xlab = 'area in sqm',
ylab = 'rent per sqm'
)
}
seq_area <- seq(min(munich_rent_index$area), max(munich_rent_index$area), length.out = 100)
plot_rentsqm_area('rent per sqm vs. area')
m1_beta <- m1$coefficients
lines(seq_area, m1_beta[1] + m1_beta[2] * seq_area, col = 'red', lwd = 2)
plot_rentsqm_area('f(area) = 1/area')
m2_beta <- m2$coefficients
lines(seq_area, m2_beta[1] + m2_beta[2] * 1 / seq_area, col = 'red', lwd = 2)
plot_residuals <- function(model) {
plot(
munich_rent_index$area,
model$residuals,
main = 'residuals',
xlab = 'area in sqm',
ylab = 'residuals'
)
abline(0, 0, col = 'red', lwd = 2)
}
plot_residuals(m1)
plot_residuals(m2)
plot_av_residuals <- function(model) {
av_residuals <- tapply(
model$residuals, munich_rent_index$area,
mean
)
plot(
unique(munich_rent_index$area),
av_residuals,
main = 'average residuals',
xlab = 'area in sqm',
ylab = 'residuals'
)
abline(0, 0, col = 'red', lwd = 2)
}
plot_av_residuals(m1)
plot_av_residuals(m2)

Example 3.4 Munich Rent Index — Polynomial Regression
For polynomial regressions we adjust two different model. Of
second-order (Model 3 (\(M3\))):
\[
\mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \text{area} +
\beta_{2} \text{area}^{2}
\]
And third-order (Model 4 (\(M4\))):
\[
\mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \text{area} +
\beta_{2} \text{area}^{2} + \beta_{3} \text{area}^{3}
\]
par(mfrow = c(2, 2))
m3 <- lm(rentsqm ~ area + I(area^2), data = munich_rent_index)
summary(m3)
Call:
lm(formula = rentsqm ~ area + I(area^2), data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-7.2150 -1.5816 -0.0915 1.5869 9.9392
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.183e+01 2.684e-01 44.081 <2e-16 ***
area -1.057e-01 7.351e-03 -14.380 <2e-16 ***
I(area^2) 4.707e-04 4.758e-05 9.892 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.255 on 3079 degrees of freedom
Multiple R-squared: 0.1433, Adjusted R-squared: 0.1428
F-statistic: 257.6 on 2 and 3079 DF, p-value: < 2.2e-16
m4 <- lm(rentsqm ~ area + I(area^2) + I(area^3), data = munich_rent_index)
summary(m4)
Call:
lm(formula = rentsqm ~ area + I(area^2) + I(area^3), data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-7.4269 -1.5854 -0.1101 1.5948 10.0670
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.428e+01 5.730e-01 24.915 < 2e-16 ***
area -2.175e-01 2.432e-02 -8.946 < 2e-16 ***
I(area^2) 1.981e-03 3.167e-04 6.254 4.54e-10 ***
I(area^3) -6.105e-06 1.266e-06 -4.823 1.49e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.247 on 3078 degrees of freedom
Multiple R-squared: 0.1497, Adjusted R-squared: 0.1489
F-statistic: 180.7 on 3 and 3078 DF, p-value: < 2.2e-16
plot_rentsqm_area('rent per sqm vs. area')
m3_beta <- m3$coefficients
lines(seq_area, m3_beta[1] + m3_beta[2] * seq_area + m3_beta[3] * seq_area ^ 2, col = 'red', lwd = 2)
plot_rentsqm_area('f(area) = 1/area')
m4_beta <- m4$coefficients
lines(seq_area, m4_beta[1] + m4_beta[2] * seq_area + m4_beta[3] * seq_area ^ 2 + m4_beta[4] * seq_area ^ 3, col = 'red', lwd = 2)
plot_residuals(m3)
plot_residuals(m4)

Example 3.5 Munich Rent Index — Additive Models
We define the following aditive models. Additive model 1:
\[
\mathcal{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot
\frac{1}{\text{area}_{i}} + \beta_{2} \cdot \text{yearc}_{i}
\]
And a second one with \(\text{yearc}\) as a polinomial of degree
3:
\[
\mathcal{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot
\frac{1}{\text{area}_{i}} + \beta_{2} \cdot \text{yearc}_{i} + \beta_{3}
\cdot {\text{yearc}_{i}}^{2} + \beta_{4} \cdot {\text{yearc}_{i}}^{3}
\]
additive_m1 <- lm(rentsqm ~ I(1/area) + yearc, data = munich_rent_index)
summary(additive_m1)
Call:
lm(formula = rentsqm ~ I(1/area) + yearc, data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-7.2682 -1.4894 -0.1401 1.3935 9.0582
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -65.406008 3.351088 -19.52 <2e-16 ***
I(1/area) 119.360735 5.636182 21.18 <2e-16 ***
yearc 0.036033 0.001721 20.94 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.097 on 3079 degrees of freedom
Multiple R-squared: 0.2591, Adjusted R-squared: 0.2586
F-statistic: 538.5 on 2 and 3079 DF, p-value: < 2.2e-16
additive_m2 <- lm(rentsqm ~ I(1/area) + yearc + I(yearc ^ 2)+ I(yearc ^ 3), data = munich_rent_index)
summary(additive_m2)
Call:
lm(formula = rentsqm ~ I(1/area) + yearc + I(yearc^2) + I(yearc^3),
data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.9920 -1.3705 -0.1354 1.3711 8.2834
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.942e+04 2.993e+04 0.983 0.326
I(1/area) 1.296e+02 5.536e+00 23.406 <2e-16 ***
yearc -4.330e+01 4.592e+01 -0.943 0.346
I(yearc^2) 2.119e-02 2.348e-02 0.902 0.367
I(yearc^3) -3.445e-06 4.002e-06 -0.861 0.389
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.039 on 3077 degrees of freedom
Multiple R-squared: 0.3, Adjusted R-squared: 0.2991
F-statistic: 329.7 on 4 and 3077 DF, p-value: < 2.2e-16
A third model is specified using the truncated year of construction
(\(\text{yearc} - 1900\)):
munich_rent_index$yearco <- munich_rent_index$yearc - 1900
additive_m3 <- lm(rentsqm ~ I(1/area) + yearco + I(yearco ^ 2)+ I(yearco ^ 3), data = munich_rent_index)
summary(additive_m3)
Call:
lm(formula = rentsqm ~ I(1/area) + yearco + I(yearco^2) + I(yearco^3),
data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.9920 -1.3705 -0.1354 1.3711 8.2834
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.417e+00 4.779e-01 11.335 < 2e-16 ***
I(1/area) 1.296e+02 5.536e+00 23.406 < 2e-16 ***
yearco -9.434e-02 3.384e-02 -2.788 0.00534 **
I(yearco^2) 1.553e-03 6.728e-04 2.308 0.02105 *
I(yearco^3) -3.445e-06 4.002e-06 -0.861 0.38947
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.039 on 3077 degrees of freedom
Multiple R-squared: 0.3, Adjusted R-squared: 0.2991
F-statistic: 329.7 on 4 and 3077 DF, p-value: < 2.2e-16
As in the book, we show the effect of year of construction in the
polynomial of degree 3:
par(mfrow = c(1, 2))
beta_add_m2 <- additive_m2$coefficients
yearc_seq <- seq(min(munich_rent_index$yearc), max(munich_rent_index$yearc), length.out = 100)
plot_effect <- function(beta, year_seq, title){
plot(
year_seq,
beta[3] * year_seq + beta[4] * year_seq^2 + beta[5] * year_seq^3,
main = title,
xlab = 'year of construction',
ylab = 'effect of year of construction',
type = 'l',
col = 'darkblue',
lwd = 2
)
}
plot_effect(beta_add_m2, yearc_seq, 'effect of year of construction')
beta_add_m3 <- additive_m3$coefficients
truncate_yearc_seq <- seq(min(munich_rent_index$yearco), max(munich_rent_index$yearco), length.out = 100)
plot_effect(beta_add_m3, truncate_yearc_seq, 'effect of year of construction, coded from 18 to 96')

A new model is defined using the orthogonal polynomial of year of
construction:
munich_rent_index$areainv <- (1 / munich_rent_index$area) - mean(1 / munich_rent_index$area)
poly_year <- poly(munich_rent_index$yearco, 3)
munich_rent_index$yearcc <- poly_year[, 1]
munich_rent_index$yearcc2 <- poly_year[, 2]
munich_rent_index$yearcc3 <- poly_year[, 3]
additive_m4 <- lm(rentsqm ~ areainv + yearcc + yearcc2 + yearcc3, data = munich_rent_index)
summary(additive_m4)
Call:
lm(formula = rentsqm ~ areainv + yearcc + yearcc2 + yearcc3,
data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.9920 -1.3705 -0.1354 1.3711 8.2834
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.11126 0.03674 193.575 <2e-16 ***
areainv 129.57166 5.53582 23.406 <2e-16 ***
yearcc 43.93838 2.07260 21.200 <2e-16 ***
yearcc2 27.53892 2.05958 13.371 <2e-16 ***
yearcc3 -1.75582 2.03998 -0.861 0.389
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.039 on 3077 degrees of freedom
Multiple R-squared: 0.3, Adjusted R-squared: 0.2991
F-statistic: 329.7 on 4 and 3077 DF, p-value: < 2.2e-16
Now we calculate the partial residuals for \(\text{area}\) and \(\text{yearc}\) define by:
\[
\hat{\epsilon}_{\text{area}, i} = \text{rentsqm}_{i} - \hat{\beta_{0}} -
\hat{f}_{2}(\text{yearc}_{i})
\]
With the effect \(f(\text{yearc}_{i})\):
\[
\hat{f}_{2}(\text{yearc}) = \beta_{2} \cdot \text{yearc}_{i} + \beta_{3}
\cdot {\text{yearc}_{i}}^{2} + \beta_{4} \cdot {\text{yearc}_{i}}^{3}
\]
And:
\[
\hat{\epsilon}_{\text{yearc}, i} = \text{rentsqm}_{i} - \hat{\beta_{0}}
- \hat{f}_{1}(\text{area}_{i})
\]
With the effect \(f(\text{yearc}_{i})\):
\[
\hat{f}_{1}(\text{yearc}) = \beta_{1} \cdot \frac{1}{\text{area}_{i}}
\]
par(mfrow = c(1, 2))
beta_add_m4 <- additive_m4$coefficients
area_seq <- seq(min(munich_rent_index$area), max(munich_rent_index$area), length.out = 100)
area_effect <- function(area){
inv_area <- 1 / area
cent_area <- inv_area - mean(inv_area)
beta_add_m4[2] * cent_area
}
yearc_effect <- function(yearc){
poly_yearc <- predict(poly_year, yearc - 1900)
beta_add_m4[3] * poly_yearc[, 1] + beta_add_m4[4] * poly_yearc[, 2] + beta_add_m4[5] * poly_yearc[, 3]
}
residual_area <- munich_rent_index$rentsqm - beta_add_m4[1] - yearc_effect(munich_rent_index$yearc)
plot(
munich_rent_index$area,
residual_area,
main = 'effect of area',
xlab = 'area in sqm',
ylab = 'effect of area / partial residuals'
)
lines(area_seq, area_effect(area_seq), col = 'red', lwd = 2)
residual_year <- munich_rent_index$rentsqm - beta_add_m4[1] - area_effect(munich_rent_index$area)
plot(
munich_rent_index$yearc,
residual_year,
main = 'effect of year of construction',
xlab = 'year of construction',
ylab = 'effect of year of construction / partial residuals'
)
lines(yearc_seq, yearc_effect(yearc_seq), col = 'red', lwd = 2)

Example 3.6 Munich Rent Index — Effect Coding
We adjust the regression model with the coding values of
location:
\[
\mathbb{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot
\text{glocation}_{1} + \beta_{2} \cdot \text{tlocation}_{1}
\]
munich_rent_index$glocation <- 0
munich_rent_index$glocation[munich_rent_index$location == 2] <- 1
munich_rent_index$glocation[munich_rent_index$location == 1] <- -1
munich_rent_index$tlocation <- 0
munich_rent_index$tlocation[munich_rent_index$location == 3] <- 1
munich_rent_index$tlocation[munich_rent_index$location == 1] <- -1
cod_model <- lm(
rentsqm ~ glocation + tlocation,
data = munich_rent_index
)
summary(cod_model)
Call:
lm(formula = rentsqm ~ glocation + tlocation, data = munich_rent_index)
Residuals:
Min 1Q Median 3Q Max
-6.8564 -1.8376 -0.1074 1.7157 10.4494
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.46704 0.09638 77.477 < 2e-16 ***
glocation -0.19479 0.10445 -1.865 0.062285 .
tlocation 0.70529 0.18558 3.800 0.000147 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.426 on 3079 degrees of freedom
Multiple R-squared: 0.008867, Adjusted R-squared: 0.008223
F-statistic: 13.77 on 2 and 3079 DF, p-value: 1.109e-06
Example 3.7 Munich Rent Index — Interaction with Quality of
Kitchen
We import the new updated model of 2001, available at official page of the
dataset as all the datasets:
munich_rent_url_01 <- "https://www.uni-goettingen.de/de/document/download/e00d039e9c1f28ab404b27b0b14931f1.raw/rent98_00.raw"
munich_rent_index_01 <- read.table(
url(munich_rent_url_01),
header = 1,
colClasses = c(
"numeric", "numeric", "numeric",
"numeric", "integer", "integer",
"integer", "integer", "integer",
"integer", "numeric", "numeric",
"numeric"
)
)
summary(munich_rent_index_01)
rent_euro rentsqm_euro area yearc
Min. : 12.83 Min. : 0.1841 Min. : 20.00 Min. :1918
1st Qu.: 320.86 1st Qu.: 5.2601 1st Qu.: 52.00 1st Qu.:1939
Median : 424.12 Median : 6.8614 Median : 66.00 Median :1960
Mean : 456.94 Mean : 7.0274 Mean : 67.92 Mean :1956
3rd Qu.: 552.40 3rd Qu.: 8.7292 3rd Qu.: 82.00 3rd Qu.:1972
Max. :1837.89 Max. :17.6688 Max. :243.00 Max. :1999
glocation tlocation nkitchen pkitchen
Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000
Mean :0.3907 Mean :0.02516 Mean :0.09888 Mean :0.04004
3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
eboden year01 yearc2 yearc3
Min. :0.0000 Min. :0.0000 Min. :3678724 Min. :7.060e+09
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:3759721 1st Qu.:7.290e+09
Median :0.0000 Median :0.0000 Median :3841600 Median :7.530e+09
Mean :0.2986 Mean :0.3257 Mean :3828362 Mean :7.493e+09
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:3888784 3rd Qu.:7.670e+09
Max. :1.0000 Max. :1.0000 Max. :3996001 Max. :7.990e+09
invarea
Min. :0.004115
1st Qu.:0.012195
Median :0.015152
Mean :0.016904
3rd Qu.:0.019231
Max. :0.050000
And we adjust the model:
\[
\begin{matrix}
\widehat{\text{rentsqm}}_{i} = & \beta_{0} + \beta_{1}
\text{areainvc}_{i} + \beta_{2} \text{yearco}_{i} + \beta_{3}
\text{yearco2}_{i} + \beta_{4} \text{yearco3}_{i} \\
& + \beta_{5} \text{year01}_{i} + \beta_{6} \text{nkitchen}_{i} +
\beta_{7} \text{pkitchen}_{i} \\
& + \beta_{8} \text{nkitchen}_{i} \cdot \text{year01}_{i} +
\beta_{9} \text{pkitchen}_{i} \cdot \text{year01}_{i}
\end{matrix}
\]
munich_rent_index_01$areainvc <- munich_rent_index_01$invarea - mean(munich_rent_index_01$invarea)
munich_rent_index_01$yearco <- munich_rent_index_01$yearc - mean(munich_rent_index_01$yearc)
poly_year_01 <- poly(munich_rent_index_01$yearco, 3)
munich_rent_index_01$yearco <- poly_year_01[, 1]
munich_rent_index_01$yearco2 <- poly_year_01[, 2]
munich_rent_index_01$yearco3 <- poly_year_01[, 3]
interaction_model <- lm(
rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 + year01 + nkitchen + pkitchen + nkitchen * year01 + pkitchen * year01,
data = munich_rent_index_01
)
summary(interaction_model)
Call:
lm(formula = rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 +
year01 + nkitchen + pkitchen + nkitchen * year01 + pkitchen *
year01, data = munich_rent_index_01)
Residuals:
Min 1Q Median 3Q Max
-7.6303 -1.2647 -0.1072 1.2414 10.5233
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.95424 0.03841 181.072 < 2e-16 ***
areainvc 123.71308 4.42630 27.950 < 2e-16 ***
yearco 49.42282 2.02589 24.396 < 2e-16 ***
yearco2 29.45781 2.02386 14.555 < 2e-16 ***
yearco3 -1.03886 1.97176 -0.527 0.598311
year01 -0.25174 0.06678 -3.770 0.000166 ***
nkitchen 0.91567 0.12172 7.523 6.43e-14 ***
pkitchen 1.09081 0.17849 6.111 1.07e-09 ***
year01:nkitchen 0.39826 0.20922 1.904 0.057033 .
year01:pkitchen 0.73511 0.32864 2.237 0.025346 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.963 on 4561 degrees of freedom
Multiple R-squared: 0.3397, Adjusted R-squared: 0.3384
F-statistic: 260.7 on 9 and 4561 DF, p-value: < 2.2e-16
Example 3.8 Munich Rent Index — Interaction Between Living Area and
Location
The model to adjust ius defined as:
\[
\mathbb{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot
\text{tlocation} + \beta_{2} \cdot \overline{\frac{1}{\text{area}}} +
\beta{3} \cdot \text{area} \cdot \text{tlocation}
\]
We use the dataset for average or top location only and change the
dummy variable \(\text{tlocation}\) to
\(0\) (average) and \(1\) (top). Finally, we visualize the effect
as described in the book.
par(mfrow = c(1, 2))
at_rent <- subset(munich_rent_index, location == 1 | location == 3)
at_rent$tlocation[at_rent$location == 1] <- 0
summary(at_rent)
rent rentsqm area yearc location
Min. : 54.92 Min. : 1.417 Min. : 20.00 Min. :1918 1:1794
1st Qu.: 314.62 1st Qu.: 5.276 1st Qu.: 50.00 1st Qu.:1954 2: 0
Median : 418.00 Median : 6.849 Median : 65.00 Median :1962 3: 78
Mean : 444.22 Mean : 7.007 Mean : 66.02 Mean :1959
3rd Qu.: 537.42 3rd Qu.: 8.652 3rd Qu.: 80.00 3rd Qu.:1972
Max. :1459.23 Max. :15.428 Max. :155.00 Max. :1997
bath kitchen cheating district yearco
Mode :logical Mode :logical Mode :logical 623 : 53 Min. :18.00
FALSE:1761 FALSE:1785 FALSE:148 1013 : 37 1st Qu.:54.00
TRUE :111 TRUE :87 TRUE :1724 713 : 32 Median :62.00
1132 : 31 Mean :59.36
1712 : 30 3rd Qu.:72.00
2024 : 30 Max. :97.00
(Other):1659
areainv yearcc yearcc2
Min. :-0.0105206 Min. :-0.030935 Min. :-0.0183373
1st Qu.:-0.0044722 1st Qu.:-0.001862 1st Qu.:-0.0147283
Median :-0.0015876 Median : 0.004598 Median :-0.0067317
Mean : 0.0002152 Mean : 0.002465 Mean :-0.0008428
3rd Qu.: 0.0030278 3rd Qu.: 0.012674 3rd Qu.: 0.0116794
Max. : 0.0330278 Max. : 0.032863 Max. : 0.0537875
yearcc3 glocation tlocation
Min. :-0.0186291 Min. :-1.0000 Min. :0.00000
1st Qu.:-0.0172309 1st Qu.:-1.0000 1st Qu.:0.00000
Median :-0.0056305 Median :-1.0000 Median :0.00000
Mean :-0.0004071 Mean :-0.9583 Mean :0.04167
3rd Qu.: 0.0096124 3rd Qu.:-1.0000 3rd Qu.:0.00000
Max. : 0.0588009 Max. : 0.0000 Max. :1.00000
area_location_model <- lm(
rentsqm ~ tlocation + areainv + area:tlocation,
data = at_rent
)
summary(area_location_model)
Call:
lm(formula = rentsqm ~ tlocation + areainv + area:tlocation,
data = at_rent)
Residuals:
Min 1Q Median 3Q Max
-7.2380 -1.4435 -0.1189 1.4788 7.1162
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.911e+00 4.967e-02 139.131 <2e-16 ***
tlocation 7.722e-01 7.280e-01 1.061 0.289
areainv 1.431e+02 7.434e+00 19.252 <2e-16 ***
tlocation:area 1.001e-02 8.612e-03 1.163 0.245
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.101 on 1868 degrees of freedom
Multiple R-squared: 0.1775, Adjusted R-squared: 0.1762
F-statistic: 134.4 on 3 and 1868 DF, p-value: < 2.2e-16
beta_al <- area_location_model$coefficients
f1_area <- function(area){
areainvc <- (1 / area) - mean(1 / area)
beta_al[3] * areainvc
}
f2_area <- function(area){
beta_al[4] * area
}
small_effect <- f1_area(area_seq)
big_effect <- beta_al[2] + f1_area(area_seq) + f2_area(area_seq)
ylim <- c(min(c(small_effect, big_effect)), max(c(small_effect, big_effect)))
plot(
area_seq,
small_effect,
main = 'area effects based on a linear interaction',
xlab = 'area in sqm',
ylab = 'area effects',
ylim = ylim,
type = 'l',
col = 'darkblue',
lwd = 2
)
lines(area_seq, big_effect, col = 'red', lwd = 2)
plot(
area_seq,
beta_al[2] + f2_area(area_seq),
main = 'varying effect of top location',
xlab = 'area in sqm',
ylab = 'Varying effect of top location',
type = 'l',
col = 'darkblue',
lwd = 2
)

Example 3.9 Orthogonal Design
Just for exemplify, we define the orthogonal process describe as an R
function:
\[
\tilde{x}^{j} = x^{j} - \widetilde{X}_{j} \left (
{\widetilde{X}_{j}}^{T} \widetilde{X}_{j} \right )^{-1}
{\widetilde{X}_{j}}^{T} x^{j}
\]
orthogonal_design <- function(x){
x_out <- cbind(x[, 1] - mean(x[, 1]), matrix(0, nrow = nrow(x), ncol = ncol(x) - 1))
for (i in 2:ncol(x)){
x_hat <- as.matrix(x_out[, 1:(i-1)])
x_inv <- solve(t(x_hat) %*% x_hat)
x_out[, i] <- x[, i] - x_hat %*% x_inv %*% t(x_hat) %*% x[, i]
}
x_out
}
We test this function on the polynomial of the \(\text{yearc}\) variable:
poly_test <- poly(munich_rent_index$yearc, 3, raw = TRUE)
orthogonal_test <- orthogonal_design(poly_test)
# And check it
c(
orthogonal_test[, 1] %*% orthogonal_test[, 2],
orthogonal_test[, 2] %*% orthogonal_test[, 3],
orthogonal_test[, 1] %*% orthogonal_test[, 3]
)
[1] 7.550556e-05 8.482947e+05 2.602290e+00
Why did it not result?
Example 3.10 Munich Rent Index — Comparison of Models Using \({R}^{2}\)
We compare the models: \(M1\), \(M2\), \(M3\), and \(M4\) using its coefficient of determination
\(R^{2}\)
coeff_deter <- data.frame(
`R Squared` = c(summary(m1)$r.squared, summary(m2)$r.squared, summary(m3)$r.squared, summary(m4)$r.squared),
row.names = c("M1", "M2", "M3", "M4")
)
coeff_deter
Example 3.11 Simple Linear Regression
For the model:
\[
y_{i} = \beta x_{i} + \epsilon_{i}
\]
We can verify:
\[
\left ( {X_{n}}^{T} X_{n} \right )^{-1} \to 0
\]
And:
\[
\max_{i=1,...,n} {x_{i}}^{T} \left ( {X_{n}}^{T} X_{n} \right )^{-1}
x_{i} \to 0 \text{ for } n \to \infty
\]
\(x_{i} = i\):
\[
\left ( {X_{n}}^{T} X_{n} \right )^{-1} = \left ( (1, 2, 3, \cdots, n)
\left ( \begin{matrix}
1 \\
2 \\
\vdots \\
n
\end{matrix} \right ) \right )^{-1} = \left ( 1 + 4 + \cdots + n^2
\right )^{-1} \to 0
\]
And:
\[
\max_{i=1,...,n} ( 1, 2, \cdots, n ) \left ( 1 + 4 + \cdots + n^2 \right
)^{-1} \left ( \begin{matrix}
1 \\
2 \\
\vdots \\
n
\end{matrix} \right ) \to 0 \text{ for } n \to \infty
\]
\(x_{i} = \frac{1}{i}\):
\[
\left ( {X_{n}}^{T} X_{n} \right )^{-1} = \left ( \left (1, 0.5, 0.33,
\cdots, \frac{1}{n} \right ) \left ( \begin{matrix}
1 \\
0.5 \\
\vdots \\
\frac{1}{n}
\end{matrix} \right ) \right )^{-1} = \left ( \sum_{i=1}^{n}
\frac{1}{i^{2}} \right )^{-1} \not{\to} 0
\]
And:
\[
\max_{i=1,...,n} \left ( 1, 0.5, \cdots, \frac{1}{n} \right ) \left (
\sum_{i = 1}^{n} \frac{1}{i^{2}} \right )^{-1} \left ( \begin{matrix}
1 \\
0.5 \\
\vdots \\
\frac{1}{n}
\end{matrix} \right ) \not{\to} 0 \text{ for } n \to \infty
\]
\(x_{i} =
\frac{1}{\sqrt{i}}\):
\[
\left ( {X_{n}}^{T} X_{n} \right )^{-1} = \left ( \sum_{i = 1}^{n}
\frac{1}{i} \right )^{-1} \to 0
\]
And:
\[
\max_{i=1,...,n} \left ( 1, \frac{1}{\sqrt{2}}, \cdots,
\frac{1}{\sqrt{n}} \right ) \left ( \sum_{i = 1}^{n} \frac{1}{i} \right
)^{-1} \begin{matrix}
1 \\
\frac{1}{\sqrt{2}} \\
\vdots \\
\frac{1}{\sqrt{n}}
\end{matrix} \to 0
\]
Example 3.12 Munich Rent Index — Hypothesis Testing
The regression model to adjust is:
\[
\begin{matrix}
\text{rentsqm}_{i} = & \beta_{0} + \beta_{1} \text{areainvc}_{i} +
\beta_{2} \text{yearco}_{i} + \beta_{3} \text{yearco2}_{i} + \beta_{4}
\text{yearco3}_{i} \\
& + \beta_{5} \text{nkitchen}_{i} + \beta_{6} \text{pkitchen}_{i} +
\beta_{7} \text{year01}_{i} + \epsilon_{i}
\end{matrix}
\]
hypothesis_model <- lm(
rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 + nkitchen + pkitchen + year01,
data = munich_rent_index_01
)
summary(hypothesis_model)
Call:
lm(formula = rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 +
nkitchen + pkitchen + year01, data = munich_rent_index_01)
Residuals:
Min 1Q Median 3Q Max
-7.674 -1.270 -0.113 1.242 10.479
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.93239 0.03757 184.539 < 2e-16 ***
areainvc 123.76987 4.42908 27.945 < 2e-16 ***
yearco 49.37274 2.02573 24.373 < 2e-16 ***
yearco2 29.49985 2.02515 14.567 < 2e-16 ***
yearco3 -0.88418 1.97229 -0.448 0.65396
nkitchen 1.04340 0.10151 10.279 < 2e-16 ***
pkitchen 1.30205 0.15226 8.552 < 2e-16 ***
year01 -0.18524 0.06213 -2.981 0.00288 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.964 on 4563 degrees of freedom
Multiple R-squared: 0.3385, Adjusted R-squared: 0.3375
F-statistic: 333.6 on 7 and 4563 DF, p-value: < 2.2e-16
So we want to test the hypothesis:
\[
\begin{matrix}
H_{0} : \beta_{7} = 0 & \text{against} & H_{1} : \beta_{7} \neq
0
\end{matrix}
\]
About the significance of the follow-up measure.
\[
\begin{matrix}
H_{0} : \left ( \begin{matrix}
\beta_{5} \\
\beta_{6}
\end{matrix} \right ) = \left ( \begin{matrix}
0 \\
0
\end{matrix} \right ) & \text{against} & H_{1} : \left (
\begin{matrix}
\beta_{5} \\
\beta_{6}
\end{matrix} \right ) \neq \left ( \begin{matrix}
0 \\
0
\end{matrix} \right )
\end{matrix}
\]
About the significance of the kitchen variable. And a final test
about the significance of differentiate between these two type of
kitchens:
\[
\begin{matrix}
H_{0} : \beta_{5} - \beta_{6} = 0 & \text{against} & H_{1} :
\beta_{5} - \beta_{6} \neq 0
\end{matrix}
\]
Example 3.13 Munich Rent Index — Standard Output and Hypothesis
Tests
For the first test (\(H_{0} : \beta_{7} =
0\)), we calculate:
\[
t_{7} =
\frac{\hat{\beta}_{7}}{{\widehat{\text{Var}(\hat{\beta}_{7})}}^{1/2}}
\sim t_{1 - \alpha / 2} (n - p)
\]
The covariance matrix is:
cov_matrix <- vcov(hypothesis_model)
cov_matrix
(Intercept) areainvc yearco yearco2 yearco3
(Intercept) 0.001411211 0.006823735 0.004991278 0.005173670 -0.002333528
areainvc 0.006823735 19.616785356 -1.294964483 1.420604414 0.122987111
yearco 0.004991278 -1.294964483 4.103591466 0.046451741 -0.006404590
yearco2 0.005173670 1.420604414 0.046451741 4.101229176 0.027782090
yearco3 -0.002333528 0.122987111 -0.006404590 0.027782090 3.889943976
nkitchen -0.001093652 -0.091944737 -0.025946481 -0.026748293 0.006345756
pkitchen -0.001128930 0.022989833 -0.042249319 -0.049085309 -0.016014007
year01 -0.001270640 0.004137400 -0.002253660 -0.001730023 0.007205398
nkitchen pkitchen year01
(Intercept) -1.093652e-03 -0.0011289298 -1.270640e-03
areainvc -9.194474e-02 0.0229898326 4.137400e-03
yearco -2.594648e-02 -0.0422493193 -2.253660e-03
yearco2 -2.674829e-02 -0.0490853093 -1.730023e-03
yearco3 6.345756e-03 -0.0160140070 7.205398e-03
nkitchen 1.030419e-02 0.0014042193 5.682974e-05
pkitchen 1.404219e-03 0.0231824417 1.902243e-04
year01 5.682974e-05 0.0001902243 3.860040e-03
We check the \(\text{Var}(\hat{\beta}_{7})\) is the last
value.
var_7 <- cov_matrix[8, 8]
t_7 <- hypothesis_model$coefficients[8] / sqrt(var_7)
t_7
year01
-2.981496
And this coincides with the value given in the
summary.
For the second test, we compute the \(F\) value as:
\[
F = \frac{1}{r} {\hat{\beta}}^{T} \widehat{\text{Cov} \left (
\hat{\beta} \right )}^{-1} \hat{\beta} \sim F_{1, n - p}
\]
With the \(\hat{\beta}\) define in
the test \(\left ( \begin{matrix}
\hat{\beta_{5}} \\
\hat{\beta_{6}}
\end{matrix} \right )\), then \(r=2\) and \(n - p
= n - 8\):
df <- nrow(munich_rent_index_01) - 8
r <- 2
beta_hat <- as.matrix(hypothesis_model$coefficients[6:7])
f_56 <- (1/2) * t(beta_hat) %*% solve(cov_matrix[6:7, 6:7]) %*% beta_hat
f_56
[,1]
[1,] 82.08307
We get expected \(F\):
f_crit <- qf(p = 0.95, r, df)
f_crit
[1] 2.9977
And we reject the null hypothesis.
The third and final test comparing \(\beta_{5}\) and \(\beta_{6}\) between each other. We
need:
\[
\widehat{\text{Var}(\hat{\beta}_{5} - \hat{\beta}_{6})} =
\widehat{\text{Var}(\hat{\beta}_{5})} +
\widehat{\text{Var}(\hat{\beta}_{6})} - 2 \cdot
\widehat{\text{Cov}(\hat{\beta}_{5}, \hat{\beta}_{6})}
\]
Using the \(\widehat{\text{Cov}(\hat{\beta})}\)
matrix:
cov_5_6 <- cov_matrix[6, 7]
var_5 <- cov_matrix[6, 6]
var_6 <- cov_matrix[7, 7]
var_5_6 <- var_5 + var_6 - 2 * cov_5_6
f_5_6 <- (hypothesis_model$coefficients[6] - hypothesis_model$coefficients[7])^2 / var_5_6
f_5_6
nkitchen
2.18071
And this \(F\) critical, using \(r=1\):
f_crit <- qf(p = 0.95, 1, df)
f_crit
[1] 3.843498
So, in this case, we do not reject the null hypothesis.
Example 3.14 Munich Rent Index — Confidence Intervals
The confidence interval for \(\beta_{7}\) is obtain using:
\[
\beta_{7} \pm t_{n - p} \left (1 - \alpha / 2 \right ) \cdot
\widehat{\text{Var}(\hat{\beta}_{7})}^{1/2}
\]
inter <- qt(1 - 0.05 / 2, df) * sqrt(var_7)
c(hypothesis_model$coefficients[8] - inter, hypothesis_model$coefficients[8] + inter)
year01 year01
-0.3070414 -0.0634347
Example 3.15 Munich Rent Index — Prediction Intervals
Using the prediction interval:
\[
{x_{0}}^{T} \cdot \hat{\beta} \pm t_{n-p}(1 - \alpha / 2) \hat{\sigma}
\left ( 1 + {x_{0}}^{T} \left (X^{T} X \right )^{1} x_{0} \right )^{1/2}
\]
We can obtain the \(\hat{\sigma}\)
directly from the model or using:
\[
(n - p) \hat{\sigma}^2 = \epsilon^{T} \epsilon
\]
sigma_model <- summary(hypothesis_model)$sigma
sigma_2 <- t(as.matrix(hypothesis_model$residuals)) %*% as.matrix(hypothesis_model$residuals) / (nrow(munich_rent_index_01) - 8)
sigma <- sqrt(sigma_2)
c(sigma_model, sigma)
[1] 1.964112 1.964112
We are also going to use the design matrix:
design_matrix <- as.matrix(hypothesis_model$model)
design_matrix[, 1] <- 1
colnames(design_matrix)[1] <- "1"
head(design_matrix)
1 areainvc yearco yearco2 yearco3 nkitchen pkitchen
1 1 0.0116672025 -0.011568466 -0.010460264 0.030118199 0 0
2 1 -0.0072887975 -0.011568466 -0.010460264 0.030118199 0 0
3 1 0.0175786025 0.009594692 -0.004320479 -0.013940551 0 0
4 1 0.0087368025 0.010256041 -0.003177207 -0.014569233 0 0
5 1 -0.0065948975 0.018853574 0.016932467 -0.005009151 0 0
6 1 -0.0007751975 0.003642554 -0.012015191 -0.002877678 0 0
year01
1 0
2 0
3 0
4 0
5 0
6 0
area_seq <- seq(min(munich_rent_index_01$area), max(munich_rent_index_01$area), length.out = 100)
areainvc <- (1 / area_seq) - mean(1 / area_seq)
betam <- hypothesis_model$coefficients
yearcc <- munich_rent_index_01[munich_rent_index_01$yearc == 1918, ][1, ]
# Three last terms were added for completeness
constant <- betam[1] + betam[3] * yearcc$yearco + betam[4] * yearcc$yearco2 + betam[5] * yearcc$yearco3 + betam[6] * 0 + betam[7] * 0 + betam[8] * 0
plot(
munich_rent_index_01$area,
munich_rent_index_01$rentsqm_euro,
xlab = 'area in sqm',
ylab = 'estimated rent per sqm'
)
beta_1 <- hypothesis_model$coefficients[2]
lines(area_seq, constant + beta_1 * areainvc, col = 'red', lwd = 2)
conf_inter <- qt(1 - 0.05 / 2, df) * sqrt(cov_matrix[2, 2])
beta_conf <- c(beta_1 - conf_inter, beta_1 + conf_inter)
lines(area_seq, constant + beta_conf[1] * areainvc, col = 'darkblue', lwd = 2)
lines(area_seq, constant + beta_conf[2] * areainvc, col = 'darkblue', lwd = 2)
x_o <- as.matrix(cbind(
1,
areainvc,
yearcc$yearco,
yearcc$yearco2,
yearcc$yearco3,
0,
0,
0
))
pred_inter <- NULL
for (i in seq_len(nrow(x_o)))
x_o_i <- as.matrix(x_o[i, ])
pred_inter <- c(pred_inter, qt(1 - 0.05 / 2, df) * sigma * sqrt(
1 + (
t(x_o_i) %*% solve(t(design_matrix)%*%design_matrix) %*% x_o_i
))
)
lines(area_seq, constant + beta_conf[1] * areainvc - pred_inter, col = 'gray', lwd = 2)
lines(area_seq, constant + beta_conf[2] * areainvc + pred_inter, col = 'gray', lwd = 2)

This can also be done using predict:
x_o <- as.data.frame(cbind(
areainvc,
yearcc$yearco,
yearcc$yearco2,
yearcc$yearco3,
0,
0,
0
))
colnames(x_o) <- colnames(hypothesis_model$model)[2:8]
confidence_interval <- predict(hypothesis_model, x_o, interval = "confidence")
plot(
munich_rent_index_01$area,
munich_rent_index_01$rentsqm_euro,
xlab = 'area in sqm',
ylab = 'estimated rent per sqm'
)
lines(area_seq, confidence_interval[, "fit"], col = 'red', lwd = 2)
lines(area_seq, confidence_interval[, "lwr"], col = 'darkblue', lwd = 2)
lines(area_seq, confidence_interval[, "upr"], col = 'darkblue', lwd = 2)
prediction_interval <- predict(hypothesis_model, x_o, interval = "prediction")
lines(area_seq, prediction_interval[, "lwr"], col = 'gray', lwd = 2)
lines(area_seq, prediction_interval[, "upr"], col = 'gray', lwd = 2)

Example 3.16 Polynomial Regression
Simulated data from the real model:
\[
y_{i} = -1 + 0.3 x_{i} + 0.4 {x_{i}}^{2} - 0.8 {x_{i}}^{3} +
\epsilon_{i} \text{ with } \epsilon_{i} \sim N(0, {0.07}^{2})
\]
par(mfrow = c(3, 2))
x_seq <- seq(0, 1, length.out = 50)
training_data <- data.frame(
x = x_seq,
y = -1 + 0.3 * x_seq + 0.4 * x_seq ^ 2 - 0.8 * x_seq ^ 3 + rnorm(50, 0, 0.07)
)
validation_data <- data.frame(
x = x_seq,
y = -1 + 0.3 * x_seq + 0.4 * x_seq ^ 2 - 0.8 * x_seq ^ 3 + rnorm(50, 0, 0.07)
)
plot_training_data <- function(title) {
plot(
training_data,
main = title,
xlab = 'x',
ylab = 'y'
)
}
plot_training_data('training_data')
plot(
validation_data,
main = 'validation_data',
xlab = 'x',
ylab = 'y'
)
adjust_polynomial <- function(l) {
model <- "y ~ x"
if (l != 1) {
for (i in 2:l){
model <- paste0(model, " + I(x^", i, ")")
}
}
lm(model, data = training_data)
}
polynomial_models <- lapply(1:9, adjust_polynomial)
plot_training_data('regression line')
lines(x_seq, predict(polynomial_models[1][[1]]), col = 'red', lwd = 2)
plot_training_data('polynomial regression with l=2')
lines(x_seq, predict(polynomial_models[2][[1]]), col = 'red', lwd = 2)
plot_training_data('polynomial regression with l=5')
lines(x_seq, predict(polynomial_models[5][[1]]), col = 'red', lwd = 2)
mean_square_error <- function(model, data){
(1 / 50) * sum((data$y - predict(model, new_data = data)) ^ 2)
}
mse_training_data <- unlist(lapply(polynomial_models, mean_square_error, data = training_data))
mse_validation_data <- unlist(lapply(polynomial_models, mean_square_error, data = validation_data))
plot(
seq(1, 9, 1),
mse_training_data,
ylim = c(min(mse_training_data, mse_validation_data), max(mse_training_data, mse_validation_data)),
main = 'MSE for training and validation data',
xlab = 'degree of polynomial',
ylab = 'MSE',
type = 'l',
col = 'darkblue',
lwd = 2
)
lines(seq(1, 9, 1), mse_validation_data, col = 'red', lwd = 2)

Example 3.17 Correlated Covariates
Simulate the data as following:
\[
x_{1} \sim U(0, 1) \wedge x_{3} \sim U(0, 1)
\]
\[
x_{2} = x_{1} + u \text{ with } u \sim U(0, 1)
\]
\[
y \sim N(-1 + 0.3 x_{1} + 0.2 x_{3}, {0.2}^{2})
\]
n <- 150
x1 <- runif(n, 0, 1)
x3 <- runif(n, 0, 1)
u <- runif(n, 0, 1)
x2 <- x1 + u
y <- rnorm(n, mean = -1 + 0.3 * x1 + 0.2 * x3, 0.2)
corr_cov_data <- data.frame(
y = y,
x1 = x1,
x2 = x2,
x3 = x3
)
pairs(corr_cov_data)

Adjust the model with \(x_{1}\),
\(x_{2}\) and \(x_{3}\):
wrong_model <- lm(y ~ x1 + x2 + x3, data = corr_cov_data)
summary(wrong_model)
Call:
lm(formula = y ~ x1 + x2 + x3, data = corr_cov_data)
Residuals:
Min 1Q Median 3Q Max
-0.45604 -0.12357 0.00926 0.11418 0.51568
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.983404 0.049308 -19.944 < 2e-16 ***
x1 0.308749 0.077842 3.966 0.000114 ***
x2 -0.003877 0.054825 -0.071 0.943728
x3 0.138695 0.051768 2.679 0.008228 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1858 on 146 degrees of freedom
Multiple R-squared: 0.2057, Adjusted R-squared: 0.1894
F-statistic: 12.6 on 3 and 146 DF, p-value: 2.252e-07
Adjust the model with \(x_{1}\) and
\(x_{3}\):
correct_model <- lm(y ~ x1 + x3, data = corr_cov_data)
summary(correct_model)
Call:
lm(formula = y ~ x1 + x3, data = corr_cov_data)
Residuals:
Min 1Q Median 3Q Max
-0.45722 -0.12438 0.00888 0.11348 0.51427
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.98526 0.04160 -23.685 < 2e-16 ***
x1 0.30469 0.05242 5.812 3.69e-08 ***
x3 0.13868 0.05159 2.688 0.00802 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1852 on 147 degrees of freedom
Multiple R-squared: 0.2057, Adjusted R-squared: 0.1949
F-statistic: 19.03 on 2 and 147 DF, p-value: 4.465e-08
Example 3.18 Polynomial Regression — Model Choice with AIC
We calculate AIC using:
\[
AIC = n \cdot \log(\hat{\sigma}^{2}) + 2 \left ( \left | M \right | + 1
\right )
\]
aic <- function(model){
m <- ncol(model$model) - 1
n <- nrow(model$model)
n * log(summary(model)$sigma^2) + 2 * (m + 1)
}
aic_values <- unlist(lapply(polynomial_models, aic))
plot(
aic_values,
main = 'AIC as a function of degree of polynomials',
xlab = 'degrees of polynomial',
ylab = 'aic',
type = 'l',
col = 'darkblue',
lwd = 2
)

As always, this can also be done using R:
aic <- function(model){
AIC(model)
}
aic_values <- unlist(lapply(polynomial_models, aic))
plot(
aic_values,
main = 'AIC as a function of degree of polynomials',
xlab = 'degrees of polynomial',
ylab = 'aic',
type = 'l',
col = 'darkblue',
lwd = 2
)

Example 3.19 Prices of Used Cars — Model Choice
As for all dataset, we import the “prices of used cars”
dataset from the official page of the
dataset:
cars_url <- "https://www.uni-goettingen.de/de/document/download/062cadfda1c4c295f9460b49c7f5799e.raw/golffull.raw"
used_cars <- read.table(
url(cars_url),
header = 1,
colClasses = c(
"numeric", "integer", "numeric",
"integer", "factor", "factor",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric"
)
)
# Convert to logical
used_cars$extras1 <- used_cars$extras1 == 1
used_cars$extras2 <- used_cars$extras2 == 1
# Recalculate polynomials
poly_kilometer <- poly(used_cars$kilometer, 3)
used_cars[, c("kilometerop1", "kilometerop2", "kilometerop3")] <- poly_kilometer
poly_age <- poly(used_cars$age, 3)
used_cars[, c("ageop1", "ageop2", "ageop3")] <- poly_age
summary(used_cars)
price age kilometer TIA
Min. :1.450 Min. : 65.00 Min. : 10.0 Min. : 0.00
1st Qu.:2.450 1st Qu.: 99.75 1st Qu.:107.0 1st Qu.:11.00
Median :3.100 Median :115.00 Median :130.8 Median :19.00
Mean :3.397 Mean :113.19 Mean :134.7 Mean :16.98
3rd Qu.:3.960 3rd Qu.:129.00 3rd Qu.:167.5 3rd Qu.:24.00
Max. :7.300 Max. :142.00 Max. :250.0 Max. :26.00
extras1 extras2 kilometerop1 kilometerop2
Mode :logical Mode :logical Min. :-0.213325 Min. :-0.05651
FALSE:54 FALSE:43 1st Qu.:-0.047433 1st Qu.:-0.04987
TRUE :118 TRUE :129 Median :-0.006815 Median :-0.03005
Mean : 0.000000 Mean : 0.00000
3rd Qu.: 0.056036 3rd Qu.: 0.02175
Max. : 0.197130 Max. : 0.40437
kilometerop3 ageop1 ageop2 ageop3
Min. :-0.532904 Min. :-0.193852 Min. :-0.07929 Min. :-0.43373
1st Qu.:-0.047845 1st Qu.:-0.054070 1st Qu.:-0.06335 1st Qu.:-0.05421
Median : 0.009036 Median : 0.007273 Median :-0.02235 Median :-0.01263
Mean : 0.000000 Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
3rd Qu.: 0.045786 3rd Qu.: 0.063588 3rd Qu.: 0.04385 3rd Qu.: 0.06266
Max. : 0.342683 Max. : 0.115881 Max. : 0.32072 Max. : 0.18010
Exploring the data:
par(mfrow = c(3, 2))
plot(
used_cars$age,
used_cars$price,
main = 'Scatter plot: sales price versus age in months',
xlab = 'age in months',
ylab = 'sales prince in 1000 Euro'
)
plot(
used_cars$kilometer,
used_cars$price,
main = 'Scatter plot: sales price versus kilometer reading',
xlab = 'kilometer reading in 1000 km',
ylab = 'sales prince in 1000 Euro'
)
plot(
used_cars$TIA,
used_cars$price,
main = 'Scatter plot: sales price versus months until TIA',
xlab = 'months until next TIA appointment',
ylab = 'sales prince in 1000 Euro'
)
boxplot(
price ~ extras1,
data = used_cars,
main = 'Box plot: sales prince with or without ABS',
xlab = '',
ylab = 'sales prices in 1000 Euro',
names = c('no ABS', 'ABS')
)
boxplot(
price ~ extras2,
data = used_cars,
main = 'Box plot: sales prince with or without sunroof',
xlab = '',
ylab = 'sales prices in 1000 Euro',
names = c('no sunroof', 'sunroof')
)

We explore the models suggested:
\(M1\)
\[
\begin{matrix}
\mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1}
\text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3}
\text{kilometerop3} \\
& + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6}
\text{ageop3}
\end{matrix}
\]
With \(\text{kilometerop}i\) the
\(i\) term of a polynomial of degree 3
of the \(\text{kilometer}\) variable.
Equivalent to \(\text{ageop}i\).
\(M2\)
\[
\begin{matrix}
\mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1}
\text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3}
\text{kilometerop3} \\
& + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6}
\text{ageop3} \\
& + \beta_{7} \text{extras1}
\end{matrix}
\]
\(M3\)
\[
\begin{matrix}
\mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1}
\text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3}
\text{kilometerop3} \\
& + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6}
\text{ageop3} \\
& + \beta_{7} \text{extras2}
\end{matrix}
\]
\(M4\)
\[
\begin{matrix}
\mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1}
\text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3}
\text{kilometerop3} \\
& + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6}
\text{ageop3} \\
& + \beta_{7} \text{TIA}
\end{matrix}
\]
\(M5\)
\[
\begin{matrix}
\mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1}
\text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3}
\text{kilometerop3} \\
& + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6}
\text{ageop3} \\
& + \beta_{7} \text{extras1} + \beta_{8} \text{extras2}
\end{matrix}
\]
\(M6\)
\[
\begin{matrix}
\mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1}
\text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3}
\text{kilometerop3} \\
& + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6}
\text{ageop3} \\
& + \beta_{7} \text{extras1} + \beta_{8} \text{TIA}
\end{matrix}
\]
\(M7\)
\[
\begin{matrix}
\mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1}
\text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3}
\text{kilometerop3} \\
& + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6}
\text{ageop3} \\
& + \beta_{7} \text{extras2} + \beta_{8} \text{TIA}
\end{matrix}
\]
\(M8\)
\[
\begin{matrix}
\mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1}
\text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3}
\text{kilometerop3} \\
& + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6}
\text{ageop3} \\
& + \beta_{7} \text{extras1} + \beta_{8} \text{extras2} +
\beta_{9} \text{TIA}
\end{matrix}
\]
base_car_model <- "price ~ kilometerop1 + kilometerop2 + kilometerop3 + ageop1 + ageop2 + ageop3"
used_cars_model <- list(lm(base_car_model, data = used_cars))
extra_var <- c("extras1", "extras2", "TIA")
for (extra in extra_var){
new_model <- paste0(base_car_model, " + ", extra)
used_cars_model[[length(used_cars_model) + 1]] <- lm(new_model, data = used_cars)
}
for (tuple in combn(extra_var, 2, simplify = FALSE)){
new_model <- paste0(base_car_model, " + ", tuple[1], " + ", tuple[2])
used_cars_model[[length(used_cars_model) + 1]] <- lm(new_model, data = used_cars)
}
all_extras <- paste(extra_var, collapse = " + ")
used_cars_model[[length(used_cars_model) + 1]] <- lm(paste0(base_car_model, " + ", all_extras), data = used_cars)
cars_aic <- data.frame(model = seq(1, 8), aic = unlist(lapply(used_cars_model, AIC)))
cars_aic
9th model are obtained using “leaps and bounds” algorithm,
already implemented on R:
library(leaps)
leaps_model <- regsubsets(price ~ . -age -kilometer, data = used_cars, nvmax = 4)
summary(leaps_model)
Subset selection object
Call: regsubsets.formula(price ~ . - age - kilometer, data = used_cars,
nvmax = 4)
9 Variables (and intercept)
Forced in Forced out
TIA FALSE FALSE
extras1TRUE FALSE FALSE
extras2TRUE FALSE FALSE
kilometerop1 FALSE FALSE
kilometerop2 FALSE FALSE
kilometerop3 FALSE FALSE
ageop1 FALSE FALSE
ageop2 FALSE FALSE
ageop3 FALSE FALSE
1 subsets of each size up to 4
Selection Algorithm: exhaustive
TIA extras1TRUE extras2TRUE kilometerop1 kilometerop2 kilometerop3
1 ( 1 ) " " " " " " " " " " " "
2 ( 1 ) " " " " " " "*" " " " "
3 ( 1 ) " " " " " " "*" " " " "
4 ( 1 ) " " " " " " "*" "*" " "
ageop1 ageop2 ageop3
1 ( 1 ) "*" " " " "
2 ( 1 ) "*" " " " "
3 ( 1 ) "*" "*" " "
4 ( 1 ) "*" "*" " "
*
*
*
*
*
*
*
*
*
*
As in the book the resulted model is:
\(M9\)
\[
\mathbb{E}(\text{price}) = \beta_{0} + \beta_{1} \text{kilometerop1} +
\beta_{2} \text{kilometerop2} + \beta_{3} \text{ageop1} + \beta_{4}
\text{ageop2}
\]
We add it to the table and plot the AIC:
selected_var <- names(which(summary(leaps_model)$outmat[4, ] == "*"))
used_cars_model[[length(used_cars_model) + 1 ]] <- lm(
paste0("price ~ ", paste(selected_var, collapse= " + ")), data = used_cars
)
cars_aic[9, ] <- c(9, AIC(used_cars_model[[9]]))
cars_aic
plot(
cars_aic,
main = 'AIC for the different models',
xlab = 'model number',
ylab = 'AIC',
xaxt = 'n'
)
axis(1, at = 1:9)

Showing the age and kilometer effects as in Example 3.5.
par(mfrow = c(1, 2))
beta_m9 <- used_cars_model[[9]]$coefficients
age_seq <- seq(min(used_cars$age), max(used_cars$age), length.out = 100)
kilometer_seq <- seq(min(used_cars$kilometer), max(used_cars$kilometer), length.out = 100)
age_effect <- function(age){
i_poly_age <- predict(poly_age, age)
beta_m9[4] * i_poly_age[, 1] + beta_m9[5] * i_poly_age[, 2]
}
kilometer_effect <- function(kilometer){
i_poly_killometer <- predict(poly_kilometer, kilometer)
beta_m9[2] * i_poly_killometer[, 1] + beta_m9[3] * i_poly_killometer[, 2]
}
residual_age <- used_cars$price - beta_m9[1] - kilometer_effect(used_cars$kilometer)
plot(
used_cars$age,
residual_age,
main = 'effect of age including partial residuals',
xlab = 'age in months',
ylab = 'effect / partial residuals'
)
lines(age_seq, age_effect(age_seq), col = 'red', lwd = 2)
residual_kilometer <- used_cars$price - beta_m9[1] - age_effect(used_cars$age)
plot(
used_cars$kilometer,
residual_age,
main = 'effect of kilometer reading including partial residuals',
xlab = 'kilometer reading in 1000km',
ylab = 'effect / partial residuals'
)
lines(kilometer_seq, kilometer_effect(kilometer_seq), col = 'red', lwd = 2)

Example 3.20 Price of Used Cars — Model Diagnosis
The studentized residuals are calculated with the equation:
\[
r_{i}^{*} = \frac{\hat{\epsilon}_{i}}{\hat{\sigma}_{(i)} \sqrt{1 -
h_{ii}}}
\]
But, you can get them using R:
par(mfrow = c(2, 2))
t_residuals <- rstudent(used_cars_model[[9]])
n <- nrow(used_cars_model[[9]]$model)
p <- ncol(used_cars_model[[9]]$model)
t_critical <- qt(1 - 0.05 / (2 * n), n - p - 1)
plot_studentized_residuals <- function(against, ag_label, short_label){
plot(
against,
t_residuals,
main = paste0('studentized residuals verus ', short_label),
xlab = ag_label,
ylab = 'studentized residuals',
ylim = c(min(c(-t_critical - 0.5, t_residuals)), max(c(t_critical + 0.5, t_residuals)))
)
abline(h = 0, col = 'red', lwd = 2)
abline(h = c(-t_critical, t_critical), col = 'gray', lwd = 2)
}
plot_studentized_residuals(used_cars_model[[9]]$fitted.values, 'estimated sales price', 'estimated sales price')
plot_studentized_residuals(used_cars$kilometer, 'kilometer reading in 1000km', 'kilometer reading')
plot_studentized_residuals(used_cars$age, 'age in months', 'age in months')
plot_studentized_residuals(used_cars$TIA, 'months to the next TIA appointment', 'months to TIA')

For the standardized residuals:
\[
r_{i} = \frac{\hat{\epsilon}_{i}}{\hat{\sigma}\sqrt{1 - h_{ii}}}
\]
But using R:
n_residuals <- rstandard(used_cars_model[[9]])
qqnorm(n_residuals, col = 'darkblue', main = '', xlab = 'quantiles of normal distribution', ylab = 'quantiles of residuals')
abline(0, 1, col = 'black', lwd = 2)

Example 3.21 Prices of Used Cars — Collinearity
Now, we want the \(VIF\):
\[
{VIF}_{j} = \frac{1}{1 - {R_{j}}^{2}}
\]
library(car)
Loading required package: carData
vif(used_cars_model[[9]])
kilometerop1 kilometerop2 ageop1 ageop2
1.179534 1.033982 1.187743 1.025773
All values \(< 10\).
Example 3.22 Prices of Used Cars—Outliers
See graph of example 3.20. Finding the outlier:
t_residuals[t_residuals > t_critical]
35
4.364175
Example 3.23 Prices of Used Cars — Influence Analysis
Getting the \(h_{ii}\) values:
\[
h_{ii} = \frac{1}{n} + {\left ( x_{i} - \overline{x} \right )}^{T} \left
( {\widetilde{X}_{j}}^{T} \widetilde{X}_{j} \right )^{-1} \left ( x_{i}
- \overline{x} \right )
\]
leverage <- hatvalues(used_cars_model[[9]])
head(leverage)
1 2 3 4 5 6
0.23265338 0.13750066 0.09221031 0.05589964 0.04240741 0.07463096
range(leverage)
[1] 0.01066563 0.23265338
h_critical <- 2 * p / n
big_leverage <- leverage > h_critical
leverage[big_leverage]
1 2 3 6 43 53 62
0.23265338 0.13750066 0.09221031 0.07463096 0.15735648 0.07461053 0.08509555
167 169 170 171 172
0.10340534 0.07523947 0.10032390 0.14468732 0.13843795
We can plot these points:
par(mfrow = c(1, 2))
plot(
used_cars$age,
used_cars$price,
main = 'Scatter plot: sales price versus age in months',
xlab = 'age in months',
ylab = 'sales prince in 1000 Euro'
)
points(used_cars$age[big_leverage], used_cars$price[big_leverage], col = 'red', pch = 16)
plot(
used_cars$kilometer,
used_cars$price,
main = 'Scatter plot: sales price versus kilometer reading',
xlab = 'kilometer reading in 1000 km',
ylab = 'sales prince in 1000 Euro'
)
points(used_cars$kilometer[big_leverage], used_cars$price[big_leverage], col = 'red', pch = 16)

And the Cook’s Distance:
\[
D_{i} = \frac{{\left ( \hat{y}_{(i)} - \hat{y} \right )}^{T} \left (
\hat{y}_{(i)} - \hat{y} \right )}{p \cdot {\hat{\sigma}}^{2}}
\]
cook <- cooks.distance(used_cars_model[[9]])
head(cook)
1 2 3 4 5 6
0.0001119627 0.0474394511 0.0566528215 0.0008099303 0.0087695413 0.0045889281
range(cook)
[1] 1.041578e-07 1.171480e-01
big_cook <- cook > 0.5
cook[big_cook]
named numeric(0)
Example 3.24 Prices of Used Cars — Alternative Models
Adjust model 9 for all observations, for \(\text{kilometer} \leq 230\) and \(\text{kilometer} \geq 50\):
model_230 <- lm(price ~ kilometerop1 + kilometerop2 + ageop1 + ageop2, data = used_cars[used_cars$kilometer <= 230, ])
model_50 <- lm(price ~ kilometerop1 + kilometerop2 + ageop1 + ageop2, data = used_cars[used_cars$kilometer >= 50, ])
new_kilometer_effect <- function(kilometer, beta){
i_poly_killometer <- predict(poly_kilometer, kilometer)
beta[2] * i_poly_killometer[, 1] + beta[3] * i_poly_killometer[, 2]
}
plot(
kilometer_seq,
kilometer_effect(kilometer_seq),
main = '',
xlab = 'kilometer reading in 1000 km',
ylab = 'effects',
type = 'l',
lwd = 2
)
lines(kilometer_seq, new_kilometer_effect(kilometer_seq, model_230$coefficients), col = 'red', lwd = 2)
lines(kilometer_seq, new_kilometer_effect(kilometer_seq, model_50$coefficients), col = 'green', lwd = 2)
legend(
"topright",
legend = c('LS with all observation', 'LS without kilometer>230', 'LS without kilometer<50'),
col = c('black', 'red', 'green'),
lwd = 2
)

---
title: "Classical Linear Model"
output: html_notebook
---

```{r}
set.seed(123)
```

## Homoscedastic Error Variances

For the visualization of homocedastic and hetercedastic variance, we simulate:

$$
y \sim N \left ( -1 + 2x, 1 \right )
$$

The funnel-shaped heterocedasticy is simulated using:

$$
y \sim N \left ( -1 + 2x, \left ( 0.1 + 0.3 \left ( x + 3 \right ) \right )^{2} \right )
$$

```{r, fig.width=10, fig.height=8, fig.align='center'}
par(mfrow = c(2, 2))

# Homecedastic variance
X <- seq(-2, 2, length.out = 300)

homecedastic_random <- function (x){
  rnorm(1, mean = -1 + 2 * x,  sd = 1)[1]
}

homo_y <- sapply(X, homecedastic_random)
homo_e <- (-1 + 2 * X) - homo_y

plot(X, homo_y, xlab='', ylab='', main='homocedastic variance')
abline(-1, 2, col = "red", lwd=2)
plot(X, homo_e, xlab='', ylab='', main='homocedastic variance errors')
abline(0, 0, col = "red", lwd=2)

heterocedastic_random <- function (x){
  rnorm(1, mean = -1 + 2 * x, sd = 0.1 + 0.3 * (x + 3))[1]
}

hetero_y <- sapply(X, heterocedastic_random)
hetero_e <- (-1 + 2 * X) - hetero_y

plot(X, hetero_y, xlab='', ylab='', main='funnel-shaped heterocedastic variance')
abline(-1, 2, col = "red", lwd=2)
plot(X, hetero_e, xlab='', ylab='', main='funnel-shaped heterocedastic variance errors')
abline(0, 0, col = "red", lwd=2)
```

## Example 3.1 Munich Rent Index—Heteroscedastic Variances

```{r, fig.width=10, fig.height=4, fig.align='center'}
par(mfrow = c(1, 2))

munich_rent_url <- "https://www.uni-goettingen.de/de/document/download/64c29c1b1fccb142cfa8f29a942a9e05.raw/rent99.raw"

munich_rent_index <- read.table(
  url(munich_rent_url),
  header = 1,
  colClasses = c(
    "numeric", "numeric", "numeric",
    "numeric", "factor", "factor",
    "factor", "factor", "factor"
  )
)

simple_reg <- lm(rent ~ area, data = munich_rent_index)

plot(munich_rent_index$area, munich_rent_index$rent, xlab = 'area in sqm', ylab = 'net rent in Euro')
abline(simple_reg, col = "red", lwd = 2)

predicted <- predict(simple_reg)

plot(munich_rent_index$area, predicted - munich_rent_index$rent, xlab = 'area in sqm', ylab = 'residuals')
abline(0, 0, col = "red", lwd = 2)
```

## Uncorrelated Errors

Simulated autocorrelated errors with positive correlation are generated using:

$$
y_{i} = -1 + 2 x_{i} + \epsilon_{i}
$$
$$
\text{Where } \epsilon_{i} = 0.9 \epsilon_{i - 1} + \mu_{i} \wedge \mu_{i} \sim N(0, 0.5^{2})
$$

```{r, fig.width=10, fig.height=4, fig.align='center'}
par(mfrow = c(1, 2))

n <- 100
X <- seq(-3, 3, length.out = n)
mu <- rnorm(n, mean = 0, sd = 0.5)

epsilon <- mu[1]
for(i in seq(2, n)){
  epsilon <- c(epsilon, 0.9 * epsilon[i - 1] + mu[i])
}

y_pred <- -1 + 2 * X + epsilon

plot(X, y_pred, xlab = 'x', ylab = '', main = 'observations and regression line')
abline(-1, 2, col = "red", lwd = 2)

plot(epsilon, xlab = 'i', ylab = '', main = 'time series of the errors')
abline(0, 0, col = "red", lwd = 2)
```

Simulated autocorrelated errors with negative correlation are generated using:

$$
y_{i} = -1 + 2 x_{i} + \epsilon_{i}
$$
$$
\text{Where } \epsilon_{i} = - 0.9 \epsilon_{i - 1} + \mu_{i} \wedge \mu_{i} \sim N(0, 0.5^{2})
$$

```{r, fig.width=10, fig.height=4, fig.align='center'}
par(mfrow = c(1, 2))

n <- 100
X <- seq(-3, 3, length.out = n)
mu <- rnorm(n, mean = 0, sd = 0.5)

epsilon <- mu[1]
for(i in seq(2, n)){
  epsilon <- c(epsilon, -0.9 * epsilon[i - 1] + mu[i])
}

y_pred <- -1 + 2 * X + epsilon

plot(X, y_pred, xlab = 'x', ylab = '', main = 'observations and regression line')
abline(-1, 2, col = "red", lwd = 2)

plot(epsilon, xlab = 'i', ylab = '', main = 'time series of the errors')
abline(0, 0, col = "red", lwd = 2)
```

Mispecified model is simulated using:

$$
y_{i} = \sin(x_{i}) + x_{i} + \epsilon_{i}
$$
$$
\text{Where } \epsilon_{i} \sim N(0, 0.3^{2})
$$

```{r, fig.width=10, fig.height=8, fig.align='center'}
par(mfrow = c(2, 2))

n <- 100
X <- seq(-3, 3, length.out = n)
epsilon <- rnorm(n, mean = 0, sd = 0.3)

y <- sin(X) + X + epsilon

plot(X, y, xlab = '', ylab = '', main = 'observations and true function')
lines(X, sin(X) + X, col = "red", lwd = 2)

linear_model <- lm(y ~ X)

plot(X, y, xlab = '', ylab = '', main = 'observations and regression line')
abline(linear_model, col = "red", lwd = 2)

y_pred <- predict(linear_model, data = X)
res <- y_pred - y
plot(X, res, xlab = '', ylab = '', main = 'residuals')
abline(0, 0, col = "red", lwd = 2)
```

## Additivity of Errors

We simulate data using:

$$
y_{i} = \exp(1 + x_{i1} - x_{i2} + \epsilon_{i})
$$

With:
$$
\epsilon_{i} \sim N \left ( 0, {0.4}^{2} \right )
$$

We plot this model:

```{r, fig.width=10, fig.height=8, fig.align='center'}
par(mfrow = c(2, 2))

n <- 50

x1 <- seq(0, 3, length.out = n)
x2 <- seq(0, 3, length.out = n)
x_grid <- expand.grid(x1 = x1, x2 = x2)
epsilon <- rnorm(n^2, mean = 0, sd = 0.4)
y <- exp(1 + x_grid$x1 - x_grid$x2 + epsilon)

plot(
  x_grid$x1, y,
  main = 'scatter plot: y versus x1',
  xlab = 'x1',
  ylab = 'y'
)
plot(
  x_grid$x2, y,
  main = 'scatter plot: y versus x2',
  xlab = 'x2',
  ylab = 'y'
)

plot(
  x_grid$x1, log(y),
  main = 'scatter plot: log(y) versus x1',
  xlab = 'x1',
  ylab = 'log(y)'
)
plot(
  x_grid$x2, log(y),
  main = 'scatter plot: log(y) versus x2',
  xlab = 'x2',
  ylab = 'log(y)'
)
```

## Example 3.2 Supermarket Scanner Data

Data not available online.

## Example 3.3 Munich Rent Index — Variable Transformation

Importing data using script:

```{r}
source("import_data/munich_rent_index.R")
```

We do the regression model:

$$
\text{rentsqm}_{i} = \beta_{0} + \beta_{1} \cdot f(\text{area}_{i}) + \epsilon_{i}
$$

With both $f(\cdot)$:

$$
f(\text{area}_{i}) = \text{area}_{i}
$$

For linear model and:

$$
f(\text{area}_{i}) = \frac{1}{\text{area}_{i}}
$$

For future use we define those models:

**Model 1 ($M1$)**:

$$
\mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \text{area}
$$

**Model 2 ($M2$)**:

$$
\mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \frac{1}{\text{area}}
$$

For nonlinear relationship between $\text{rentsqm}$ and $\text{area}$. The plotted graph below are those two models with the left panels the linear regression without the transformation and the right panels with the inverse transformation.

```{r, fig.width=10, fig.height=12, fig.align='center'}
par(mfrow = c(3, 2))

m1 <- lm(rentsqm ~ area, data = munich_rent_index)
summary(m1)

m2 <- lm(rentsqm ~ I(1 / area), data = munich_rent_index)
summary(m2)

plot_rentsqm_area <- function(title) {
  plot(
    munich_rent_index$area,
    munich_rent_index$rentsqm,
    main = title,
    xlab = 'area in sqm',
    ylab = 'rent per sqm'
  )
}

seq_area <- seq(min(munich_rent_index$area), max(munich_rent_index$area), length.out = 100)

plot_rentsqm_area('rent per sqm vs. area')
m1_beta <- m1$coefficients
lines(seq_area, m1_beta[1] + m1_beta[2] * seq_area, col = 'red', lwd = 2)

plot_rentsqm_area('f(area) = 1/area')
m2_beta <- m2$coefficients
lines(seq_area, m2_beta[1] + m2_beta[2] * 1 / seq_area, col = 'red', lwd = 2)

plot_residuals <- function(model) {
  plot(
    munich_rent_index$area,
    model$residuals,
    main = 'residuals',
    xlab = 'area in sqm',
    ylab = 'residuals'
  )
  abline(0, 0, col = 'red', lwd = 2)
}

plot_residuals(m1)
plot_residuals(m2)

plot_av_residuals <- function(model) {
  av_residuals <- tapply(
    model$residuals, munich_rent_index$area,
    mean
  )
  plot(
    unique(munich_rent_index$area),
    av_residuals,
    main = 'average residuals',
    xlab = 'area in sqm',
    ylab = 'residuals'
  )
  abline(0, 0, col = 'red', lwd = 2)
}

plot_av_residuals(m1)
plot_av_residuals(m2)
```


## Example 3.4 Munich Rent Index — Polynomial Regression

For polynomial regressions we adjust two different model. Of second-order (**Model 3 ($M3$)**):

$$
\mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \text{area} + \beta_{2} \text{area}^{2}
$$

And third-order (**Model 4 ($M4$)**):

$$
\mathbb{E}(\text{rentqsm}) = \beta_{0} + \beta_{1} \text{area} + \beta_{2} \text{area}^{2} + \beta_{3} \text{area}^{3}
$$

```{r, fig.width=10, fig.height=12, fig.align='center'}
par(mfrow = c(2, 2))

m3 <- lm(rentsqm ~ area + I(area^2), data = munich_rent_index)
summary(m3)

m4 <- lm(rentsqm ~ area + I(area^2) + I(area^3), data = munich_rent_index)
summary(m4)

plot_rentsqm_area('rent per sqm vs. area')
m3_beta <- m3$coefficients
lines(seq_area, m3_beta[1] + m3_beta[2] * seq_area + m3_beta[3] * seq_area ^ 2, col = 'red', lwd = 2)

plot_rentsqm_area('f(area) = 1/area')
m4_beta <- m4$coefficients
lines(seq_area, m4_beta[1] + m4_beta[2] * seq_area + m4_beta[3] * seq_area ^ 2 + m4_beta[4] * seq_area ^ 3, col = 'red', lwd = 2)

plot_residuals(m3)
plot_residuals(m4)
```

## Example 3.5 Munich Rent Index — Additive Models

We define the following aditive models. Additive model 1:

$$
\mathcal{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot \frac{1}{\text{area}_{i}} + \beta_{2} \cdot \text{yearc}_{i}
$$

And a second one with $\text{yearc}$ as a polinomial of degree 3:

$$
\mathcal{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot \frac{1}{\text{area}_{i}} + \beta_{2} \cdot \text{yearc}_{i} + \beta_{3} \cdot {\text{yearc}_{i}}^{2} + \beta_{4} \cdot {\text{yearc}_{i}}^{3}
$$

```{r}
additive_m1 <- lm(rentsqm ~ I(1/area) + yearc, data = munich_rent_index)
summary(additive_m1)

additive_m2 <- lm(rentsqm ~ I(1/area) + yearc + I(yearc ^ 2)+ I(yearc ^ 3), data = munich_rent_index)
summary(additive_m2)
```

A third model is specified using the truncated year of construction ($\text{yearc} - 1900$):

```{r}
munich_rent_index$yearco <- munich_rent_index$yearc - 1900
additive_m3 <- lm(rentsqm ~ I(1/area) + yearco + I(yearco ^ 2)+ I(yearco ^ 3), data = munich_rent_index)
summary(additive_m3)
```

As in the book, we show the effect of year of construction in the polynomial of degree 3:

```{r, fig.width=10, fig.height=4, fig.align='center'}
par(mfrow = c(1, 2))
beta_add_m2 <- additive_m2$coefficients

yearc_seq <- seq(min(munich_rent_index$yearc), max(munich_rent_index$yearc), length.out = 100)

plot_effect <- function(beta, year_seq, title){
  plot(
    year_seq,
    beta[3] * year_seq + beta[4] * year_seq^2 + beta[5] * year_seq^3,
    main = title,
    xlab = 'year of construction',
    ylab = 'effect of year of construction',
    type = 'l',
    col = 'darkblue',
    lwd = 2
  )
}

plot_effect(beta_add_m2, yearc_seq, 'effect of year of construction')

beta_add_m3 <- additive_m3$coefficients
truncate_yearc_seq <- seq(min(munich_rent_index$yearco), max(munich_rent_index$yearco), length.out = 100)
plot_effect(beta_add_m3, truncate_yearc_seq, 'effect of year of construction, coded from 18 to 96')
```

A new model is defined using the orthogonal polynomial of year of construction:

```{r}
munich_rent_index$areainv <- (1 / munich_rent_index$area) - mean(1 / munich_rent_index$area)
poly_year <- poly(munich_rent_index$yearco, 3)
munich_rent_index$yearcc <- poly_year[, 1]
munich_rent_index$yearcc2 <- poly_year[, 2]
munich_rent_index$yearcc3 <- poly_year[, 3]
additive_m4 <- lm(rentsqm ~ areainv + yearcc + yearcc2 + yearcc3, data = munich_rent_index)
summary(additive_m4)
```

Now we calculate the partial residuals for $\text{area}$ and $\text{yearc}$ define by:

$$
\hat{\epsilon}_{\text{area}, i} = \text{rentsqm}_{i} - \hat{\beta_{0}} - \hat{f}_{2}(\text{yearc}_{i})
$$

With the effect $f(\text{yearc}_{i})$:

$$
\hat{f}_{2}(\text{yearc}) = \beta_{2} \cdot \text{yearc}_{i} + \beta_{3} \cdot {\text{yearc}_{i}}^{2} + \beta_{4} \cdot {\text{yearc}_{i}}^{3}
$$

And:

$$
\hat{\epsilon}_{\text{yearc}, i} = \text{rentsqm}_{i} - \hat{\beta_{0}} - \hat{f}_{1}(\text{area}_{i})
$$

With the effect $f(\text{yearc}_{i})$:

$$
\hat{f}_{1}(\text{yearc}) = \beta_{1} \cdot \frac{1}{\text{area}_{i}}
$$

```{r, fig.width=10, fig.height=4, fig.align='center'}
par(mfrow = c(1, 2))

beta_add_m4 <- additive_m4$coefficients

area_seq <- seq(min(munich_rent_index$area), max(munich_rent_index$area), length.out = 100)

area_effect <- function(area){
  inv_area <- 1 / area
  cent_area <- inv_area - mean(inv_area)
  beta_add_m4[2] * cent_area
}

yearc_effect <- function(yearc){
  poly_yearc <- predict(poly_year, yearc - 1900)
  beta_add_m4[3] * poly_yearc[, 1] + beta_add_m4[4] * poly_yearc[, 2] + beta_add_m4[5] * poly_yearc[, 3]
}

residual_area <- munich_rent_index$rentsqm - beta_add_m4[1] - yearc_effect(munich_rent_index$yearc)
plot(
  munich_rent_index$area,
  residual_area,
  main = 'effect of area',
  xlab = 'area in sqm',
  ylab = 'effect of area / partial residuals'
)
lines(area_seq, area_effect(area_seq), col = 'red', lwd = 2)

residual_year <- munich_rent_index$rentsqm - beta_add_m4[1] - area_effect(munich_rent_index$area)
plot(
  munich_rent_index$yearc,
  residual_year,
  main = 'effect of year of construction',
  xlab = 'year of construction',
  ylab = 'effect of year of construction / partial residuals'
)
lines(yearc_seq, yearc_effect(yearc_seq), col = 'red', lwd = 2)
```

## Example 3.6 Munich Rent Index — Effect Coding

We adjust the regression model with the coding values of location:

$$
\mathbb{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot \text{glocation}_{1} + \beta_{2} \cdot \text{tlocation}_{1}
$$

```{r}
munich_rent_index$glocation <- 0
munich_rent_index$glocation[munich_rent_index$location == 2] <- 1
munich_rent_index$glocation[munich_rent_index$location == 1] <- -1

munich_rent_index$tlocation <- 0
munich_rent_index$tlocation[munich_rent_index$location == 3] <- 1
munich_rent_index$tlocation[munich_rent_index$location == 1] <- -1

cod_model <- lm(
  rentsqm ~ glocation + tlocation,
  data = munich_rent_index
)
summary(cod_model)
```

## Example 3.7 Munich Rent Index — Interaction with Quality of Kitchen

We import the new updated model of 2001, available at [official page of the dataset](https://www.uni-goettingen.de/en/551625.html) as all the datasets:

```{r}
munich_rent_url_01 <- "https://www.uni-goettingen.de/de/document/download/e00d039e9c1f28ab404b27b0b14931f1.raw/rent98_00.raw"

munich_rent_index_01 <- read.table(
  url(munich_rent_url_01),
  header = 1,
  colClasses = c(
    "numeric", "numeric", "numeric",
    "numeric", "integer", "integer",
    "integer", "integer", "integer",
    "integer", "numeric", "numeric",
    "numeric"
  )
)

summary(munich_rent_index_01)
```

And we adjust the model:

$$
\begin{matrix}
  \widehat{\text{rentsqm}}_{i} = & \beta_{0} + \beta_{1} \text{areainvc}_{i} + \beta_{2} \text{yearco}_{i} + \beta_{3} \text{yearco2}_{i} + \beta_{4} \text{yearco3}_{i} \\
   & + \beta_{5} \text{year01}_{i} + \beta_{6} \text{nkitchen}_{i} + \beta_{7} \text{pkitchen}_{i} \\
   & + \beta_{8} \text{nkitchen}_{i} \cdot \text{year01}_{i} + \beta_{9} \text{pkitchen}_{i} \cdot \text{year01}_{i}
\end{matrix}
$$

```{r}
munich_rent_index_01$areainvc <- munich_rent_index_01$invarea - mean(munich_rent_index_01$invarea)
munich_rent_index_01$yearco <- munich_rent_index_01$yearc - mean(munich_rent_index_01$yearc)
poly_year_01 <- poly(munich_rent_index_01$yearco, 3)
munich_rent_index_01$yearco <- poly_year_01[, 1]
munich_rent_index_01$yearco2 <- poly_year_01[, 2]
munich_rent_index_01$yearco3 <- poly_year_01[, 3]

interaction_model <- lm(
  rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 + year01 + nkitchen + pkitchen + nkitchen * year01 + pkitchen * year01,
  data = munich_rent_index_01
)

summary(interaction_model)
```

## Example 3.8 Munich Rent Index — Interaction Between Living Area and Location

The model to adjust ius defined as:

$$
\mathbb{E}(\text{rentsqm}_{i}) = \beta_{0} + \beta_{1} \cdot \text{tlocation} + \beta_{2} \cdot \overline{\frac{1}{\text{area}}} + \beta{3} \cdot \text{area} \cdot \text{tlocation}
$$

We use the dataset for average or top location only and change the dummy variable $\text{tlocation}$ to $0$ (average) and $1$ (top). Finally, we visualize the effect as described in the book.

```{r, fig.width=10, fig.height=4, fig.align='center'}
par(mfrow = c(1, 2))
at_rent <- subset(munich_rent_index, location == 1 | location == 3)
at_rent$tlocation[at_rent$location == 1] <- 0
summary(at_rent)

area_location_model <- lm(
  rentsqm ~ tlocation + areainv + area:tlocation,
  data = at_rent
)
summary(area_location_model)

beta_al <- area_location_model$coefficients

f1_area <- function(area){
  areainvc <- (1 / area) - mean(1 / area)
  beta_al[3] * areainvc
}

f2_area <- function(area){
  beta_al[4] * area
}

small_effect <- f1_area(area_seq)
big_effect <- beta_al[2] + f1_area(area_seq) + f2_area(area_seq)

ylim <- c(min(c(small_effect, big_effect)), max(c(small_effect, big_effect)))

plot(
  area_seq,
  small_effect,
  main = 'area effects based on a linear interaction',
  xlab = 'area in sqm',
  ylab = 'area effects',
  ylim = ylim,
  type = 'l',
  col = 'darkblue',
  lwd = 2
)
lines(area_seq, big_effect, col = 'red', lwd = 2)

plot(
  area_seq,
  beta_al[2] + f2_area(area_seq),
  main = 'varying effect of top location',
  xlab = 'area in sqm',
  ylab = 'Varying effect of top location',
  type = 'l',
  col = 'darkblue',
  lwd = 2
)
```

## Example 3.9 Orthogonal Design

Just for exemplify, we define the orthogonal process describe as an R function:

$$
\tilde{x}^{j} = x^{j} - \widetilde{X}_{j} \left ( {\widetilde{X}_{j}}^{T} \widetilde{X}_{j} \right )^{-1} {\widetilde{X}_{j}}^{T} x^{j}
$$

```{r}
orthogonal_design <- function(x){
  x_out <- cbind(x[, 1] - mean(x[, 1]), matrix(0, nrow = nrow(x), ncol = ncol(x) - 1))
  for (i in 2:ncol(x)){
    x_hat <- as.matrix(x_out[, 1:(i-1)])
    x_inv <- solve(t(x_hat) %*% x_hat)
    x_out[, i] <- x[, i] - x_hat %*% x_inv %*% t(x_hat) %*% x[, i]
  }
  x_out
}
```
We test this function on the polynomial of the $\text{yearc}$ variable:

```{r}
poly_test <- poly(munich_rent_index$yearc, 3, raw = TRUE)
orthogonal_test <- orthogonal_design(poly_test)

# And check it
c(
  orthogonal_test[, 1] %*% orthogonal_test[, 2],
  orthogonal_test[, 2] %*% orthogonal_test[, 3],
  orthogonal_test[, 1] %*% orthogonal_test[, 3]
)
```

Why did it not result?

## Example 3.10 Munich Rent Index — Comparison of Models Using ${R}^{2}$

We compare the models: $M1$, $M2$, $M3$, and $M4$ using its coefficient of determination $R^{2}$

```{r}
coeff_deter <- data.frame(
  `R Squared` = c(summary(m1)$r.squared, summary(m2)$r.squared, summary(m3)$r.squared, summary(m4)$r.squared),
  row.names = c("M1", "M2", "M3", "M4")
)
coeff_deter
```

## Example 3.11 Simple Linear Regression

For the model:

$$
y_{i} = \beta x_{i} + \epsilon_{i}
$$

We can verify:

$$
\left ( {X_{n}}^{T} X_{n} \right )^{-1} \to 0
$$

And:

$$
\max_{i=1,...,n} {x_{i}}^{T} \left ( {X_{n}}^{T} X_{n} \right )^{-1} x_{i} \to 0 \text{ for } n \to \infty
$$

### $x_{i} = i$:

$$
\left ( {X_{n}}^{T} X_{n} \right )^{-1} = \left ( (1, 2, 3, \cdots, n) \left ( \begin{matrix}
1 \\
2 \\
\vdots  \\
n
\end{matrix} \right ) \right )^{-1} = \left ( 1 + 4 + \cdots + n^2 \right )^{-1} \to 0
$$

And:

$$
\max_{i=1,...,n} ( 1, 2, \cdots, n ) \left ( 1 + 4 + \cdots + n^2 \right )^{-1} \left ( \begin{matrix}
1 \\
2 \\
\vdots  \\
n
\end{matrix} \right ) \to 0 \text{ for } n \to \infty
$$

### $x_{i} = \frac{1}{i}$:

$$
\left ( {X_{n}}^{T} X_{n} \right )^{-1} = \left ( \left (1, 0.5, 0.33, \cdots, \frac{1}{n} \right ) \left ( \begin{matrix}
1 \\
0.5 \\
\vdots  \\
\frac{1}{n}
\end{matrix} \right ) \right )^{-1} = \left ( \sum_{i=1}^{n} \frac{1}{i^{2}} \right )^{-1} \not{\to} 0
$$

And:

$$
\max_{i=1,...,n} \left ( 1, 0.5, \cdots, \frac{1}{n} \right ) \left ( \sum_{i = 1}^{n} \frac{1}{i^{2}} \right )^{-1} \left ( \begin{matrix}
1 \\
0.5 \\
\vdots  \\
\frac{1}{n}
\end{matrix} \right ) \not{\to} 0 \text{ for } n \to \infty
$$

### $x_{i} = \frac{1}{\sqrt{i}}$:

$$
\left ( {X_{n}}^{T} X_{n} \right )^{-1} = \left ( \sum_{i = 1}^{n} \frac{1}{i} \right )^{-1} \to 0
$$

And:

$$
\max_{i=1,...,n} \left ( 1, \frac{1}{\sqrt{2}}, \cdots, \frac{1}{\sqrt{n}} \right ) \left ( \sum_{i = 1}^{n} \frac{1}{i} \right )^{-1} \begin{matrix}
1 \\
\frac{1}{\sqrt{2}} \\
\vdots \\
\frac{1}{\sqrt{n}}
\end{matrix} \to 0
$$

## Example 3.12 Munich Rent Index — Hypothesis Testing

The regression model to adjust is:

$$
\begin{matrix}
\text{rentsqm}_{i} = & \beta_{0} + \beta_{1} \text{areainvc}_{i} + \beta_{2} \text{yearco}_{i} + \beta_{3} \text{yearco2}_{i} + \beta_{4} \text{yearco3}_{i} \\
& + \beta_{5} \text{nkitchen}_{i} + \beta_{6} \text{pkitchen}_{i} + \beta_{7} \text{year01}_{i} + \epsilon_{i}
\end{matrix}
$$

```{r}
hypothesis_model <- lm(
  rentsqm_euro ~ areainvc + yearco + yearco2 + yearco3 + nkitchen + pkitchen + year01,
  data = munich_rent_index_01
)
summary(hypothesis_model)
```

So we want to test the hypothesis:

$$
\begin{matrix}
H_{0} : \beta_{7} = 0 & \text{against} & H_{1} : \beta_{7} \neq 0
\end{matrix}
$$

About the significance of the follow-up measure.

$$
\begin{matrix}
H_{0} : \left ( \begin{matrix}
\beta_{5} \\
\beta_{6}
\end{matrix} \right ) = \left ( \begin{matrix}
0 \\
0
\end{matrix} \right ) & \text{against} & H_{1} : \left ( \begin{matrix}
\beta_{5} \\
\beta_{6}
\end{matrix} \right ) \neq \left ( \begin{matrix}
0 \\
0
\end{matrix} \right )
\end{matrix}
$$

About the significance of the kitchen variable. And a final test about the significance of differentiate between these two type of kitchens:

$$
\begin{matrix}
H_{0} : \beta_{5} - \beta_{6} = 0 & \text{against} & H_{1} : \beta_{5} - \beta_{6} \neq 0
\end{matrix}
$$

## Example 3.13 Munich Rent Index — Standard Output and Hypothesis Tests

For the first test ($H_{0} : \beta_{7} = 0$), we calculate:

$$
t_{7} = \frac{\hat{\beta}_{7}}{{\widehat{\text{Var}(\hat{\beta}_{7})}}^{1/2}} \sim t_{1 - \alpha / 2} (n - p)
$$

The covariance matrix is:

```{r}
cov_matrix <- vcov(hypothesis_model)
cov_matrix
```

We check the $\text{Var}(\hat{\beta}_{7})$ is the last value.

```{r}
var_7 <- cov_matrix[8, 8]
t_7 <- hypothesis_model$coefficients[8] / sqrt(var_7)
t_7
```

And this coincides with the value given in the `summary`.

For the second test, we compute the $F$ value as:

$$
F = \frac{1}{r} {\hat{\beta}}^{T} \widehat{\text{Cov} \left ( \hat{\beta} \right )}^{-1} \hat{\beta} \sim F_{1, n - p}
$$

With the $\hat{\beta}$ define in the test $\left ( \begin{matrix}
\hat{\beta_{5}} \\
\hat{\beta_{6}}
\end{matrix} \right )$, then $r=2$ and $n - p = n - 8$:

```{r}
df <- nrow(munich_rent_index_01) - 8
r <- 2
beta_hat <- as.matrix(hypothesis_model$coefficients[6:7])
f_56 <- (1/2) * t(beta_hat) %*% solve(cov_matrix[6:7, 6:7]) %*% beta_hat
f_56
```

We get expected $F$:

```{r}
f_crit <- qf(p = 0.95, r, df)
f_crit
```

And we reject the null hypothesis.

The third and final test comparing $\beta_{5}$ and $\beta_{6}$ between each other. We need:

$$
\widehat{\text{Var}(\hat{\beta}_{5} - \hat{\beta}_{6})} = \widehat{\text{Var}(\hat{\beta}_{5})} + \widehat{\text{Var}(\hat{\beta}_{6})} - 2 \cdot \widehat{\text{Cov}(\hat{\beta}_{5}, \hat{\beta}_{6})}
$$

Using the $\widehat{\text{Cov}(\hat{\beta})}$ matrix:

```{r}
cov_5_6 <- cov_matrix[6, 7]
var_5 <- cov_matrix[6, 6]
var_6 <- cov_matrix[7, 7]
var_5_6 <- var_5 + var_6 - 2 * cov_5_6
f_5_6 <- (hypothesis_model$coefficients[6] - hypothesis_model$coefficients[7])^2 / var_5_6
f_5_6
```

And this $F$ critical, using $r=1$:

```{r}
f_crit <- qf(p = 0.95, 1, df)
f_crit
```

So, in this case, we do not reject the null hypothesis.

## Example 3.14 Munich Rent Index — Confidence Intervals

The confidence interval for $\beta_{7}$ is obtain using:

$$
\beta_{7} \pm t_{n - p} \left (1 - \alpha / 2 \right ) \cdot \widehat{\text{Var}(\hat{\beta}_{7})}^{1/2}
$$

```{r}
inter <- qt(1 - 0.05 / 2, df) * sqrt(var_7)
c(hypothesis_model$coefficients[8] - inter, hypothesis_model$coefficients[8] + inter)
```

## Example 3.15 Munich Rent Index — Prediction Intervals

Using the prediction interval:

$$
{x_{0}}^{T} \cdot \hat{\beta} \pm t_{n-p}(1 - \alpha / 2) \hat{\sigma} \left ( 1 + {x_{0}}^{T} \left (X^{T} X \right )^{1} x_{0} \right )^{1/2}
$$

We can obtain the $\hat{\sigma}$ directly from the model or using:

$$
(n - p) \hat{\sigma}^2 = \epsilon^{T} \epsilon
$$

```{r}
sigma_model <- summary(hypothesis_model)$sigma
sigma_2 <- t(as.matrix(hypothesis_model$residuals)) %*% as.matrix(hypothesis_model$residuals) / (nrow(munich_rent_index_01) - 8)
sigma <- sqrt(sigma_2)
c(sigma_model, sigma)
```

We are also going to use the design matrix:

```{r}
design_matrix <- as.matrix(hypothesis_model$model)
design_matrix[, 1] <- 1
colnames(design_matrix)[1] <- "1"
head(design_matrix)
```

```{r, fig.width=10, fig.height=6, fig.align='center'}
area_seq <- seq(min(munich_rent_index_01$area), max(munich_rent_index_01$area), length.out = 100)
areainvc <- (1 / area_seq) - mean(1 / area_seq)

betam <- hypothesis_model$coefficients
yearcc <- munich_rent_index_01[munich_rent_index_01$yearc == 1918, ][1, ]
# Three last terms were added for completeness
constant <- betam[1] + betam[3] * yearcc$yearco + betam[4] * yearcc$yearco2 + betam[5] * yearcc$yearco3 + betam[6] * 0 + betam[7] * 0 + betam[8] * 0

plot(
  munich_rent_index_01$area,
  munich_rent_index_01$rentsqm_euro,
  xlab = 'area in sqm',
  ylab = 'estimated rent per sqm'
)
beta_1 <- hypothesis_model$coefficients[2]
lines(area_seq, constant + beta_1 * areainvc, col = 'red', lwd = 2)

conf_inter <- qt(1 - 0.05 / 2, df) * sqrt(cov_matrix[2, 2])
beta_conf <- c(beta_1 - conf_inter, beta_1 + conf_inter)
lines(area_seq, constant + beta_conf[1] * areainvc, col = 'darkblue', lwd = 2)
lines(area_seq, constant + beta_conf[2] * areainvc, col = 'darkblue', lwd = 2)

x_o <- as.matrix(cbind(
  1,
  areainvc,
  yearcc$yearco,
  yearcc$yearco2,
  yearcc$yearco3,
  0,
  0,
  0
))

pred_inter <- NULL

for (i in seq_len(nrow(x_o)))
  x_o_i <- as.matrix(x_o[i, ])
  pred_inter <- c(pred_inter, qt(1 - 0.05 / 2, df) * sigma * sqrt(
    1 + (
      t(x_o_i) %*% solve(t(design_matrix)%*%design_matrix) %*% x_o_i
   ))
)

lines(area_seq, constant + beta_conf[1] * areainvc - pred_inter, col = 'gray', lwd = 2)
lines(area_seq, constant + beta_conf[2] * areainvc + pred_inter, col = 'gray', lwd = 2)
```

This can also be done using `predict`:

```{r, fig.width=10, fig.height=6, fig.align='center'}
x_o <- as.data.frame(cbind(
  areainvc,
  yearcc$yearco,
  yearcc$yearco2,
  yearcc$yearco3,
  0,
  0,
  0
))
colnames(x_o) <- colnames(hypothesis_model$model)[2:8]
confidence_interval <- predict(hypothesis_model, x_o, interval = "confidence")

plot(
    munich_rent_index_01$area,
    munich_rent_index_01$rentsqm_euro,
    xlab = 'area in sqm',
    ylab = 'estimated rent per sqm'
)
lines(area_seq, confidence_interval[, "fit"], col = 'red', lwd = 2)
lines(area_seq, confidence_interval[, "lwr"], col = 'darkblue', lwd = 2)
lines(area_seq, confidence_interval[, "upr"], col = 'darkblue', lwd = 2)

prediction_interval <- predict(hypothesis_model, x_o, interval = "prediction")
lines(area_seq, prediction_interval[, "lwr"], col = 'gray', lwd = 2)
lines(area_seq, prediction_interval[, "upr"], col = 'gray', lwd = 2)
```

## Example 3.16 Polynomial Regression

Simulated data from the real model:

$$
y_{i} = -1 + 0.3 x_{i} + 0.4 {x_{i}}^{2} - 0.8 {x_{i}}^{3} + \epsilon_{i} \text{ with } \epsilon_{i} \sim N(0, {0.07}^{2})
$$

```{r, fig.width=10, fig.height=12, fig.align='center'}
par(mfrow = c(3, 2))

x_seq <- seq(0, 1, length.out = 50)
training_data <- data.frame(
  x = x_seq,
  y = -1 + 0.3 * x_seq + 0.4 * x_seq ^ 2 - 0.8 * x_seq ^ 3 + rnorm(50, 0, 0.07)
)
validation_data <- data.frame(
  x = x_seq,
  y = -1 + 0.3 * x_seq + 0.4 * x_seq ^ 2 - 0.8 * x_seq ^ 3 + rnorm(50, 0, 0.07)
)

plot_training_data <- function(title) {
  plot(
    training_data,
    main = title,
    xlab = 'x',
    ylab = 'y'
  )
}

plot_training_data('training_data')
plot(
  validation_data,
  main = 'validation_data',
  xlab = 'x',
  ylab = 'y'
)

adjust_polynomial <- function(l) {
  model <- "y ~ x"
  if (l != 1) {
    for (i in 2:l){
      model <- paste0(model, " + I(x^", i, ")")
    }
  }
  lm(model, data = training_data)
}

polynomial_models <- lapply(1:9, adjust_polynomial)
plot_training_data('regression line')
lines(x_seq, predict(polynomial_models[1][[1]]), col = 'red', lwd = 2)

plot_training_data('polynomial regression with l=2')
lines(x_seq, predict(polynomial_models[2][[1]]), col = 'red', lwd = 2)

plot_training_data('polynomial regression with l=5')
lines(x_seq, predict(polynomial_models[5][[1]]), col = 'red', lwd = 2)

mean_square_error <- function(model, data){
  (1 / 50) * sum((data$y - predict(model, new_data = data)) ^ 2)
}

mse_training_data <- unlist(lapply(polynomial_models, mean_square_error, data = training_data))
mse_validation_data <- unlist(lapply(polynomial_models, mean_square_error, data = validation_data))

plot(
  seq(1, 9, 1),
  mse_training_data,
  ylim = c(min(mse_training_data, mse_validation_data), max(mse_training_data, mse_validation_data)),
  main = 'MSE for training and validation data',
  xlab = 'degree of polynomial',
  ylab = 'MSE',
  type = 'l',
  col = 'darkblue',
  lwd = 2
)
lines(seq(1, 9, 1), mse_validation_data, col = 'red', lwd = 2)
```

## Example 3.17 Correlated Covariates

Simulate the data as following:

$$
x_{1} \sim U(0, 1) \wedge x_{3} \sim U(0, 1)
$$

$$
x_{2} = x_{1} + u \text{ with } u \sim U(0, 1)
$$

$$
y \sim N(-1 + 0.3 x_{1} + 0.2 x_{3}, {0.2}^{2})
$$

```{r, fig.width=8, fig.height=8, fig.align='center'}
n <- 150

x1 <- runif(n, 0, 1)
x3 <- runif(n, 0, 1)
u <- runif(n, 0, 1)

x2 <- x1 + u

y <- rnorm(n, mean = -1 + 0.3 * x1 + 0.2 * x3, 0.2)

corr_cov_data <- data.frame(
  y = y,
  x1 = x1,
  x2 = x2,
  x3 = x3
)
pairs(corr_cov_data)
```

Adjust the model with $x_{1}$, $x_{2}$ and $x_{3}$:

```{r}
wrong_model <- lm(y ~ x1 + x2 + x3, data = corr_cov_data)
summary(wrong_model)
```

Adjust the model with $x_{1}$ and $x_{3}$:

```{r}
correct_model <- lm(y ~ x1 + x3, data = corr_cov_data)
summary(correct_model)
```

## Example 3.18 Polynomial Regression — Model Choice with AIC

We calculate AIC using:

$$
AIC = n \cdot \log(\hat{\sigma}^{2}) +  2 \left ( \left | M \right | + 1 \right )
$$

```{r, fig.width=5, fig.height=4, fig.align='center'}
aic <- function(model){
  m <- ncol(model$model) - 1
  n <- nrow(model$model)
  n * log(summary(model)$sigma^2) + 2 * (m + 1)
}

aic_values <- unlist(lapply(polynomial_models, aic))
plot(
  aic_values,
  main = 'AIC as a function of degree of polynomials',
  xlab = 'degrees of polynomial',
  ylab = 'aic',
  type = 'l',
  col = 'darkblue',
  lwd = 2
)
```

As always, this can also be done using R:

```{r, fig.width=5, fig.height=4, fig.align='center'}
aic <- function(model){
  AIC(model)
}

aic_values <- unlist(lapply(polynomial_models, aic))
plot(
        aic_values,
        main = 'AIC as a function of degree of polynomials',
        xlab = 'degrees of polynomial',
        ylab = 'aic',
        type = 'l',
        col = 'darkblue',
        lwd = 2
)
```

## Example 3.19 Prices of Used Cars — Model Choice

As for all dataset, we import the _"prices of used cars"_ dataset from the [official page of the dataset](https://www.uni-goettingen.de/en/551625.html):

```{r}
cars_url <- "https://www.uni-goettingen.de/de/document/download/062cadfda1c4c295f9460b49c7f5799e.raw/golffull.raw"

used_cars <- read.table(
  url(cars_url),
  header = 1,
  colClasses = c(
    "numeric", "integer", "numeric",
    "integer", "factor", "factor",
    "numeric", "numeric", "numeric",
    "numeric", "numeric", "numeric"
  )
)

# Convert to logical
used_cars$extras1 <- used_cars$extras1 == 1
used_cars$extras2 <- used_cars$extras2 == 1

# Recalculate polynomials
poly_kilometer <- poly(used_cars$kilometer, 3)
used_cars[, c("kilometerop1", "kilometerop2", "kilometerop3")] <- poly_kilometer
poly_age <- poly(used_cars$age, 3)
used_cars[, c("ageop1", "ageop2", "ageop3")] <- poly_age
summary(used_cars)
```

Exploring the data:

```{r, fig.width=10, fig.height=12, fig.align='center'}
par(mfrow = c(3, 2))

plot(
  used_cars$age,
  used_cars$price,
  main = 'Scatter plot: sales price versus age in months',
  xlab = 'age in months',
  ylab = 'sales prince in 1000 Euro'
)

plot(
  used_cars$kilometer,
  used_cars$price,
  main = 'Scatter plot: sales price versus kilometer reading',
  xlab = 'kilometer reading in 1000 km',
  ylab = 'sales prince in 1000 Euro'
)

plot(
  used_cars$TIA,
  used_cars$price,
  main = 'Scatter plot: sales price versus months until TIA',
  xlab = 'months until next TIA appointment',
  ylab = 'sales prince in 1000 Euro'
)

boxplot(
  price ~ extras1,
  data = used_cars,
  main = 'Box plot: sales prince with or without ABS',
  xlab = '',
  ylab = 'sales prices in 1000 Euro',
  names = c('no ABS', 'ABS')
)

boxplot(
  price ~ extras2,
  data = used_cars,
  main = 'Box plot: sales prince with or without sunroof',
  xlab = '',
  ylab = 'sales prices in 1000 Euro',
  names = c('no sunroof', 'sunroof')
)
```

We explore the models suggested:

**$M1$**

$$
\begin{matrix}
  \mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1} \text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3} \text{kilometerop3} \\
   & + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6} \text{ageop3}
\end{matrix}
$$

With $\text{kilometerop}i$ the $i$ term of a polynomial of degree 3 of the $\text{kilometer}$ variable. Equivalent to $\text{ageop}i$.

**$M2$**

$$
\begin{matrix}
  \mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1} \text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3} \text{kilometerop3} \\
   & + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6} \text{ageop3} \\
   & + \beta_{7} \text{extras1}
\end{matrix}
$$

**$M3$**

$$
\begin{matrix}
  \mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1} \text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3} \text{kilometerop3} \\
   & + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6} \text{ageop3} \\
   & + \beta_{7} \text{extras2}
\end{matrix}
$$

**$M4$**

$$
\begin{matrix}
  \mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1} \text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3} \text{kilometerop3} \\
   & + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6} \text{ageop3} \\
   & + \beta_{7} \text{TIA}
\end{matrix}
$$

**$M5$**

$$
\begin{matrix}
  \mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1} \text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3} \text{kilometerop3} \\
   & + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6} \text{ageop3} \\
   & + \beta_{7} \text{extras1} + \beta_{8} \text{extras2}
\end{matrix}
$$

**$M6$**

$$
\begin{matrix}
  \mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1} \text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3} \text{kilometerop3} \\
   & + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6} \text{ageop3} \\
   & + \beta_{7} \text{extras1} + \beta_{8} \text{TIA}
\end{matrix}
$$

**$M7$**

$$
\begin{matrix}
  \mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1} \text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3} \text{kilometerop3} \\
   & + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6} \text{ageop3} \\
   & + \beta_{7} \text{extras2} + \beta_{8} \text{TIA}
\end{matrix}
$$

**$M8$**

$$
\begin{matrix}
  \mathbb{E}(\text{price}) = & \beta_{0} + \beta_{1} \text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3} \text{kilometerop3} \\
   & + \beta_{4} \text{ageop1} + \beta_{5} \text{ageop2} + \beta_{6} \text{ageop3} \\
   & + \beta_{7} \text{extras1} + \beta_{8} \text{extras2} + \beta_{9} \text{TIA}
\end{matrix}
$$


```{r}
base_car_model <- "price ~ kilometerop1 + kilometerop2 + kilometerop3 + ageop1 + ageop2 + ageop3"
used_cars_model <- list(lm(base_car_model, data = used_cars))
extra_var <- c("extras1", "extras2", "TIA")
for (extra in extra_var){
  new_model <- paste0(base_car_model, " + ", extra)
  used_cars_model[[length(used_cars_model) + 1]] <- lm(new_model, data = used_cars)
}
for (tuple in combn(extra_var, 2, simplify = FALSE)){
  new_model <- paste0(base_car_model, " + ", tuple[1], " + ", tuple[2])
  used_cars_model[[length(used_cars_model) + 1]] <- lm(new_model, data = used_cars)
}
all_extras <- paste(extra_var, collapse = " + ")
used_cars_model[[length(used_cars_model) + 1]] <- lm(paste0(base_car_model, " + ", all_extras), data = used_cars)

cars_aic <- data.frame(model = seq(1, 8), aic = unlist(lapply(used_cars_model, AIC)))
cars_aic
```

9th model are obtained using _"leaps and  bounds"_ algorithm, already implemented on R:

```{r}
library(leaps)

leaps_model <- regsubsets(price ~ . -age -kilometer, data = used_cars, nvmax = 4)

summary(leaps_model)
```

As in the book the resulted model is:

**$M9$**

$$
\mathbb{E}(\text{price}) = \beta_{0} + \beta_{1} \text{kilometerop1} + \beta_{2} \text{kilometerop2} + \beta_{3} \text{ageop1} + \beta_{4} \text{ageop2}
$$

We add it to the table and plot the AIC:

```{r, fig.width=5, fig.height=5, fig.align='center'}
selected_var <- names(which(summary(leaps_model)$outmat[4, ] == "*"))
used_cars_model[[length(used_cars_model) + 1 ]] <- lm(
  paste0("price ~ ", paste(selected_var, collapse= " + ")), data = used_cars
)
cars_aic[9, ] <- c(9, AIC(used_cars_model[[9]]))
cars_aic

plot(
  cars_aic,
  main = 'AIC for the different models',
  xlab = 'model number',
  ylab = 'AIC',
  xaxt = 'n'
)
axis(1, at = 1:9)
```

Showing the age and kilometer effects as in Example 3.5.


```{r, fig.width=10, fig.height=4, fig.align='center'}
par(mfrow = c(1, 2))

beta_m9 <- used_cars_model[[9]]$coefficients

age_seq <- seq(min(used_cars$age), max(used_cars$age), length.out = 100)
kilometer_seq <- seq(min(used_cars$kilometer), max(used_cars$kilometer), length.out = 100)

age_effect <- function(age){
  i_poly_age <- predict(poly_age, age)
  beta_m9[4] * i_poly_age[, 1] + beta_m9[5] * i_poly_age[, 2]
}

kilometer_effect <- function(kilometer){
  i_poly_killometer <- predict(poly_kilometer, kilometer)
  beta_m9[2] * i_poly_killometer[, 1] + beta_m9[3] * i_poly_killometer[, 2]
}

residual_age <- used_cars$price - beta_m9[1] - kilometer_effect(used_cars$kilometer)
plot(
  used_cars$age,
  residual_age,
  main = 'effect of age including partial residuals',
  xlab = 'age in months',
  ylab = 'effect / partial residuals'
)
lines(age_seq, age_effect(age_seq), col = 'red', lwd = 2)

residual_kilometer <- used_cars$price - beta_m9[1] - age_effect(used_cars$age)
plot(
  used_cars$kilometer,
  residual_age,
  main = 'effect of kilometer reading including partial residuals',
  xlab = 'kilometer reading in 1000km',
  ylab = 'effect / partial residuals'
)
lines(kilometer_seq, kilometer_effect(kilometer_seq), col = 'red', lwd = 2)
```

## Example 3.20 Price of Used Cars — Model Diagnosis

The studentized residuals are calculated with the equation:

$$
r_{i}^{*} = \frac{\hat{\epsilon}_{i}}{\hat{\sigma}_{(i)} \sqrt{1 - h_{ii}}}
$$

But, you can get them using R:

```{r, fig.width=10, fig.height=8, fig.align='center'}
par(mfrow = c(2, 2))

t_residuals <- rstudent(used_cars_model[[9]])

n <- nrow(used_cars_model[[9]]$model)
p <- ncol(used_cars_model[[9]]$model)
t_critical <- qt(1 - 0.05 / (2 * n), n - p - 1)

plot_studentized_residuals <- function(against, ag_label, short_label){
  plot(
    against,
    t_residuals,
    main = paste0('studentized residuals verus ', short_label),
    xlab = ag_label,
    ylab = 'studentized residuals',
    ylim = c(min(c(-t_critical - 0.5, t_residuals)), max(c(t_critical + 0.5, t_residuals)))
  )
  abline(h = 0, col = 'red', lwd = 2)
  abline(h = c(-t_critical, t_critical), col = 'gray', lwd = 2)
}
plot_studentized_residuals(used_cars_model[[9]]$fitted.values, 'estimated sales price', 'estimated sales price')
plot_studentized_residuals(used_cars$kilometer, 'kilometer reading in 1000km', 'kilometer reading')
plot_studentized_residuals(used_cars$age, 'age in months', 'age in months')
plot_studentized_residuals(used_cars$TIA, 'months to the next TIA appointment', 'months to TIA')
```

For the standardized residuals:

$$
r_{i} = \frac{\hat{\epsilon}_{i}}{\hat{\sigma}\sqrt{1 - h_{ii}}}
$$

But using R:

```{r, fig.width=5, fig.height=5, fig.align='center'}
n_residuals <- rstandard(used_cars_model[[9]])

qqnorm(n_residuals, col = 'darkblue', main = '', xlab = 'quantiles of normal distribution', ylab = 'quantiles of residuals')
abline(0, 1, col = 'black', lwd = 2)
```

## Example 3.21 Prices of Used Cars — Collinearity

Now, we want the $VIF$:

$$
{VIF}_{j} = \frac{1}{1 - {R_{j}}^{2}}
$$

```{r}
library(car)
vif(used_cars_model[[9]])
```

All values $< 10$.

## Example 3.22 Prices of Used Cars—Outliers

See graph of example 3.20. Finding the outlier:

```{r}
t_residuals[t_residuals > t_critical]
```

## Example 3.23 Prices of Used Cars — Influence Analysis

Getting the $h_{ii}$ values:

$$
h_{ii} = \frac{1}{n} + {\left ( x_{i} - \overline{x} \right )}^{T} \left ( {\widetilde{X}_{j}}^{T} \widetilde{X}_{j} \right )^{-1} \left ( x_{i} - \overline{x} \right )
$$

```{r}
leverage <- hatvalues(used_cars_model[[9]])
head(leverage)
range(leverage)

h_critical <- 2 * p / n

big_leverage <- leverage > h_critical
leverage[big_leverage]
```

We can plot these points:

```{r, fig.width=10, fig.height=4, fig.align='center'}
par(mfrow = c(1, 2))

plot(
  used_cars$age,
  used_cars$price,
  main = 'Scatter plot: sales price versus age in months',
  xlab = 'age in months',
  ylab = 'sales prince in 1000 Euro'
)
points(used_cars$age[big_leverage], used_cars$price[big_leverage], col = 'red', pch = 16)

plot(
  used_cars$kilometer,
  used_cars$price,
  main = 'Scatter plot: sales price versus kilometer reading',
  xlab = 'kilometer reading in 1000 km',
  ylab = 'sales prince in 1000 Euro'
)
points(used_cars$kilometer[big_leverage], used_cars$price[big_leverage], col = 'red', pch = 16)
```

And the **Cook's Distance**:

$$
D_{i} = \frac{{\left ( \hat{y}_{(i)} - \hat{y} \right )}^{T} \left ( \hat{y}_{(i)} - \hat{y} \right )}{p \cdot {\hat{\sigma}}^{2}}
$$

```{r}
cook <- cooks.distance(used_cars_model[[9]])
head(cook)
range(cook)

big_cook <- cook > 0.5
cook[big_cook]
```

## Example 3.24 Prices of Used Cars — Alternative Models

Adjust model 9 for all observations, for $\text{kilometer} \leq 230$ and $\text{kilometer} \geq 50$:

```{r, fig.width=5, fig.height=4, fig.align='center'}
model_230 <- lm(price ~ kilometerop1 + kilometerop2 + ageop1 + ageop2, data = used_cars[used_cars$kilometer <= 230, ])
model_50 <- lm(price ~ kilometerop1 + kilometerop2 + ageop1 + ageop2, data = used_cars[used_cars$kilometer >= 50, ])

new_kilometer_effect <- function(kilometer, beta){
  i_poly_killometer <- predict(poly_kilometer, kilometer)
  beta[2] * i_poly_killometer[, 1] + beta[3] * i_poly_killometer[, 2]
}

plot(
  kilometer_seq,
  kilometer_effect(kilometer_seq),
  main = '',
  xlab = 'kilometer reading in 1000 km',
  ylab = 'effects',
  type = 'l',
  lwd = 2
)
lines(kilometer_seq, new_kilometer_effect(kilometer_seq, model_230$coefficients), col = 'red', lwd = 2)
lines(kilometer_seq, new_kilometer_effect(kilometer_seq, model_50$coefficients), col = 'green', lwd = 2)

legend(
  "topright",
  legend = c('LS with all observation', 'LS without kilometer>230', 'LS without kilometer<50'),
  col = c('black', 'red', 'green'),
  lwd = 2
)
```
